# need: mr, seq, contrib, hyp_contrib, tasks
from basepair.imports import *  # star import presumably provides pd, np, Path, tqdm used below
from basepair.modisco.utils import load_imp_scores
# assumption: ModiscoResult lives here; drop this if the star import above already provides it
from basepair.modisco.results import ModiscoResult
def modisco_score2(modisco_dir,
                   imp_scores,
                   output_file,
                   trim_frac=0.08,
                   importance='weighted',
                   n_jobs=20):
    """Score pattern instances across all examples using the TF-MoDISco patterns.

    Args:
      modisco_dir: TF-MoDISco output directory
      imp_scores: hdf5 file of importance scores (contains the weighted scores)
      output_file: output path for the tsv file. If the suffix is
        .tsv.gz, the file is also gzipped
      trim_frac: information-content fraction at which to trim the pattern flanks
      importance: which importance scores to use
      n_jobs: number of parallel jobs to use

    Writes a gzipped tsv file (tsv.gz)
    """
    seq, contrib, hyp_contrib, profile, ranges = load_imp_scores(imp_scores, importance=importance)

    modisco_dir = Path(modisco_dir)
    # per-pattern normalization table
    dfm_norm = pd.read_csv(modisco_dir / "centroid_seqlet_matches.csv")

    mr = ModiscoResult(modisco_dir / "modisco.h5")
    mr.open()
    tasks = mr.tasks()

    dfl = []
    for pattern_name in tqdm(mr.patterns()):
        pattern = mr.get_pattern(pattern_name).trim_seq_ic(trim_frac)
        match, imp = pattern.scan_importance(contrib, hyp_contrib, tasks,
                                             n_jobs=n_jobs, verbose=False, plot=False)
        seq_match = pattern.scan_seq(seq, n_jobs=n_jobs, verbose=False)
        dfi = pattern.get_instances(tasks, match, imp, seq_match,
                                    norm_df=dfm_norm[dfm_norm.pattern == pattern_name],
                                    verbose=False, plot=False)
        dfl.append(dfi)
    # merge and write the results
    dfp = pd.concat(dfl)

    # append the genomic ranges, prefixing their columns
    # (e.g. chrom -> example_chrom) to avoid clashes with the instance columns
    ranges.columns = ["example_" + v for v in ranges.columns]
    dfp = dfp.merge(ranges, left_on="example_idx", how='left', right_on="example_id")

    if output_file.endswith(".gz"):
        compression = 'gzip'
    else:
        compression = None
    dfp.to_csv(output_file, index=False, sep='\t', compression=compression)
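
# Example usage (a sketch - the paths below are hypothetical placeholders,
# not files shipped with the repo):
# modisco_score2(modisco_dir="output/modisco",
#                imp_scores="output/imp_scores.h5",
#                output_file="output/instances.tsv.gz",
#                trim_frac=0.08, importance='weighted', n_jobs=20)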
# --- scratch: exploring rank-based quantile normalization ---
a = pd.Series(np.arange(10))
a[0] = 20

# sorted values and their empirical CDF positions (append 1 as the top quantile)
vals = a.sort_values().values
p = np.concatenate([np.arange(len(vals)) / len(vals), [1]])

from basepair.stats import quantile_norm
quantile_norm(vals, np.arange(10))

# a smaller test input with ties
vals = np.array([3, 0, 0, 1, 2])
np.sort(np.arange(10))  # the reference distribution, already sorted
p  # inspect the quantile grid

# fractional ranks of `a` with an extra 0 appended
# (pd.Series.append is deprecated; pd.concat is the supported way)
pd.concat([np.argsort(a) / len(a), pd.Series([0])])
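
# For reference, a minimal self-contained sketch of rank-based quantile
# normalization. This is an assumption about what basepair.stats.quantile_norm
# does (map each value of `x` onto the reference distribution via its
# empirical quantile); the helper below is illustrative only.
def _quantile_norm_sketch(x, reference):
    """Map values of `x` onto `reference` by matching empirical quantiles."""
    ref = np.sort(np.asarray(reference))
    x = np.asarray(x)
    # fractional rank of each element of x, scaled to [0, 1]
    q = np.argsort(np.argsort(x)) / max(len(x) - 1, 1)
    # index into the sorted reference at the matching quantile
    idx = np.round(q * (len(ref) - 1)).astype(int)
    return ref[idx]

_quantile_norm_sketch(np.array([3, 0, 0, 1, 2]), np.arange(10))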
def modisco_score_instances(modisco_dir, imp_scores, output_dir, trim_frac=0.08, n_jobs=1):
    """Draft: score pattern instances per TF and cache the scores.

    TODO: `pattern_names`, `mr`, `tasks` and `cache_file` are not defined
    yet - they need to be wired up as in modisco_score2 above.
    """
    seq, contrib, hyp_contrib, profile, ranges = load_imp_scores(imp_scores)

    dfl = []
    match_l, importance_l, seq_match_l = [], [], []
    for tf, pattern_name in pattern_names:
        pattern = mr.get_pattern(pattern_name).trim_seq_ic(trim_frac)
        match, imp = pattern.scan_importance(contrib, hyp_contrib, tasks,
                                             n_jobs=n_jobs, verbose=True)
        seq_match = pattern.scan_seq(seq, n_jobs=n_jobs, verbose=True)
        dfm = pattern.get_instances(tasks, match, imp, seq_match, fdr=0.01, verbose=True)
        dfm['tf'] = tf
        dfl.append(dfm)
        match_l.append(match)
        importance_l.append(imp)
        seq_match_l.append(seq_match)
    dfp = pd.concat(dfl)

    # cache
    cached = True
    dfp.to_hdf(cache_file, "/scores")
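
# Reading the cached scores back later (pd.read_hdf is the standard pandas
# counterpart of DataFrame.to_hdf; `cache_file` as above):
# dfp = pd.read_hdf(cache_file, "/scores")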