# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
from basepair.modisco.pattern_instances import load_instances, filter_nonoverlapping_intervals
# Common paths on Surya
ddir = Path("/srv/scratch/avsec/workspace/chipnexus/data/")
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/deeplift/profile/"
mr = ModiscoResult(modisco_dir / 'modisco.h5')
mr.open()
# load scanned instances
dfi = load_instances(modisco_dir / 'instances.parq', dedup=False)
dfi = filter_nonoverlapping_intervals(dfi)
dfi.head()
%matplotlib inline
o = []
for pattern in tqdm(mr.patterns()):
# get seqlet regions
dfips = mr.get_seqlet_intervals(pattern, trim_frac=0.08, as_df=True)
# remove duplicates
dfips = dfips[~dfips[['chrom', 'start', 'end', 'strand']].duplicated()]
# get the right pattern
dfip = dfi[dfi.pattern == pattern][['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand', 'imp_weighted_p', 'match_weighted_p', 'seq_match_p']]
# remove any remaining duplicates
dfip = dfip[~dfip[['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand']].duplicated()]
# require a minimal match to be the same as for seqlets
dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > 0)]
# merge the two
common = pd.merge(dfip, dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='inner')
o.append({"pattern": pattern, "n_seqlets": len(dfips), "n_newly_matched": len(dfip), "match_weighted_p": 0, "seqlets_covered": len(common)})
dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > .01)]
common = pd.merge(dfip, dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='inner')
o.append({"pattern": pattern, "n_seqlets": len(dfips), "n_newly_matched": len(dfip), "match_weighted_p": .01, "seqlets_covered": len(common)})
dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > .2)]
common = pd.merge(dfip, dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='inner')
o.append({"pattern": pattern, "n_seqlets": len(dfips), "n_newly_matched": len(dfip), "match_weighted_p": .2, "seqlets_covered": len(common)})
df = pd.DataFrame(o)
df = df.set_index("pattern")
df['matched'] = df.seqlets_covered
df['seqlets_covered'] = df.matched / df.n_seqlets
df[df.match_weighted_p == 0].plot.scatter("n_seqlets", "n_newly_matched")
plt.xscale("log")
plt.yscale("log")
plt.plot([10, 1e5], [10, 1e5]);
df[df.match_weighted_p == 0].plot.scatter("n_seqlets", 'seqlets_covered')
plt.xscale("log")
df[df.match_weighted_p == 0.01].plot.scatter("n_seqlets", "n_newly_matched")
plt.xscale("log")
plt.yscale("log")
plt.plot([10, 1e5], [10, 1e5]);
df[df.match_weighted_p == 0.01].plot.scatter("n_seqlets", 'seqlets_covered')
plt.xscale("log")
df[df.match_weighted_p == .2].plot.scatter("n_seqlets", "n_newly_matched")
plt.xscale("log")
plt.yscale("log")
plt.plot([10, 1e5], [10, 1e5]);
df[df.match_weighted_p == .2].plot.scatter("n_seqlets", 'seqlets_covered')
plt.xscale("log")
For a motif, check the heatmaps for:
from basepair.cli.imp_score import ImpScoreFile
imp_scores = ImpScoreFile.from_modisco_dir(modisco_dir)
profiles = imp_scores.get_profiles()
pattern = 'metacluster_0/pattern_0'
tfc = 'Oct4'
mr.get_pattern(pattern).plot("seq_ic");
dfips = mr.get_seqlet_intervals(pattern, trim_frac=0.08, as_df=True)
dfips = dfips[~dfips[['chrom', 'start', 'end', 'strand']].duplicated()]
modisco_seqlets = [s for i, s in enumerate(mr._get_seqlets(pattern, trim_frac=0.08)) if i in dfips.index]
dfips.head()
# get the right pattern
dfip = dfi[dfi.pattern == pattern][['pattern', 'example_idx', 'pattern_start', 'pattern_end', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand', 'imp_weighted_p', 'match_weighted_p', 'seq_match_p']]
# remove any remaining duplicates
dfip = dfip[~dfip[['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand']].duplicated()]
# require a minimal match to be the same as for seqlets
dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > 0.0)]
# dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > .2)]
from basepair.modisco.pattern_instances import dfi2seqlets, annotate_profile
from basepair.modisco.results import Seqlet, resize_seqlets
from basepair.plot.profiles import extract_signal
from basepair.plot.profiles import plot_stranded_profile, multiple_plot_stranded_profile
from basepair.plot.heatmaps import heatmap_stranded_profile, multiple_heatmap_stranded_profile
seqlets = resize_seqlets(modisco_seqlets, 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip[dfip.match_weighted_p > 0.1]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip[dfip.match_weighted_p > 0.01]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
# merge
dfips['seqlet_id'] = np.arange(len(dfips))
dfip_merged = pd.merge(dfip, dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='left')
df_not_found = dfip_merged[dfip_merged.start.isnull()]
df_not_found['pattern'] = df_not_found['pattern_x']
dfip_merged_s = pd.merge(dfip[dfip.match_weighted_p > .01], dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='right')
df_not_found_s = dfip_merged_s[dfip_merged_s.pattern_x.isnull()]
df_not_found_s['pattern'] = df_not_found_s['pattern_x']
not_rediscovered_seqlets = [s for i, s in enumerate(modisco_seqlets) if i in df_not_found_s.seqlet_id]
len(not_rediscovered_seqlets)
seqlets = resize_seqlets(not_rediscovered_seqlets, 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
len(df_not_found[df_not_found.match_weighted_p > .2])
seqlets = resize_seqlets(dfi2seqlets(df_not_found[df_not_found.match_weighted_p > .2]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
len(df_not_found[df_not_found.match_weighted_p > .01])
seqlets = resize_seqlets(dfi2seqlets(df_not_found[df_not_found.match_weighted_p > .01]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
pattern = 'metacluster_0/pattern_1'
tfc = 'Sox2'
mr.get_pattern(pattern).plot("seq_ic");
dfips = mr.get_seqlet_intervals(pattern, trim_frac=0.08, as_df=True)
dfips = dfips[~dfips[['chrom', 'start', 'end', 'strand']].duplicated()]
modisco_seqlets = [s for i, s in enumerate(mr._get_seqlets(pattern, trim_frac=0.08)) if i in dfips.index]
dfips.head()
# get the right pattern
dfip = dfi[dfi.pattern == pattern][['pattern', 'example_idx', 'pattern_start', 'pattern_end', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand', 'imp_weighted_p', 'match_weighted_p', 'seq_match_p']]
# remove any remaining duplicates
dfip = dfip[~dfip[['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand']].duplicated()]
# require a minimal match to be the same as for seqlets
dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > 0.0)]
# dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > .2)]
from basepair.modisco.pattern_instances import dfi2seqlets, annotate_profile
from basepair.modisco.results import Seqlet, resize_seqlets
from basepair.plot.profiles import extract_signal
from basepair.plot.profiles import plot_stranded_profile, multiple_plot_stranded_profile
from basepair.plot.heatmaps import heatmap_stranded_profile, multiple_heatmap_stranded_profile
seqlets = resize_seqlets(modisco_seqlets, 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip[dfip.match_weighted_p > 0.1]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip[dfip.match_weighted_p > 0.01]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
# merge
dfips['seqlet_id'] = np.arange(len(dfips))
dfip_merged = pd.merge(dfip, dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='left')
df_not_found = dfip_merged[dfip_merged.start.isnull()]
df_not_found['pattern'] = df_not_found['pattern_x']
dfip_merged_s = pd.merge(dfip[dfip.match_weighted_p > .01], dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='right')
df_not_found_s = dfip_merged_s[dfip_merged_s.pattern_x.isnull()]
df_not_found_s['pattern'] = df_not_found_s['pattern_x']
not_rediscovered_seqlets = [s for i, s in enumerate(modisco_seqlets) if i in df_not_found_s.seqlet_id]
len(not_rediscovered_seqlets)
seqlets = resize_seqlets(not_rediscovered_seqlets, 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
len(df_not_found[df_not_found.match_weighted_p > .2])
seqlets = resize_seqlets(dfi2seqlets(df_not_found[df_not_found.match_weighted_p > .2]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
len(df_not_found[df_not_found.match_weighted_p > .01])
seqlets = resize_seqlets(dfi2seqlets(df_not_found[df_not_found.match_weighted_p > .01]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
from basepair.exp.paper.config import motifs
motifs
pattern = 'metacluster_2/pattern_0'
tfc = 'Nanog'
mr.get_pattern(pattern).plot("seq_ic");
dfips = mr.get_seqlet_intervals(pattern, trim_frac=0.08, as_df=True)
dfips = dfips[~dfips[['chrom', 'start', 'end', 'strand']].duplicated()]
modisco_seqlets = [s for i, s in enumerate(mr._get_seqlets(pattern, trim_frac=0.08)) if i in dfips.index]
dfips.head()
# get the right pattern
dfip = dfi[dfi.pattern == pattern][['pattern', 'example_idx', 'pattern_start', 'pattern_end', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand', 'imp_weighted_p', 'match_weighted_p', 'seq_match_p']]
# remove any remaining duplicates
dfip = dfip[~dfip[['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand']].duplicated()]
# require a minimal match to be the same as for seqlets
dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > 0.0)]
# dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > .2)]
from basepair.modisco.pattern_instances import dfi2seqlets, annotate_profile
from basepair.modisco.results import Seqlet, resize_seqlets
from basepair.plot.profiles import extract_signal
from basepair.plot.profiles import plot_stranded_profile, multiple_plot_stranded_profile
from basepair.plot.heatmaps import heatmap_stranded_profile, multiple_heatmap_stranded_profile
seqlets = resize_seqlets(modisco_seqlets, 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip[dfip.match_weighted_p > 0.1]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip[dfip.match_weighted_p > 0.01]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
# merge
dfips['seqlet_id'] = np.arange(len(dfips))
dfip_merged = pd.merge(dfip, dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='left')
df_not_found = dfip_merged[dfip_merged.start.isnull()]
df_not_found['pattern'] = df_not_found['pattern_x']
dfip_merged_s = pd.merge(dfip[dfip.match_weighted_p > .01], dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='right')
df_not_found_s = dfip_merged_s[dfip_merged_s.pattern_x.isnull()]
df_not_found_s['pattern'] = df_not_found_s['pattern_x']
not_rediscovered_seqlets = [s for i, s in enumerate(modisco_seqlets) if i in df_not_found_s.seqlet_id]
len(not_rediscovered_seqlets)
seqlets = resize_seqlets(not_rediscovered_seqlets, 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
len(df_not_found[df_not_found.match_weighted_p > .2])
seqlets = resize_seqlets(dfi2seqlets(df_not_found[df_not_found.match_weighted_p > .2]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
len(df_not_found[df_not_found.match_weighted_p > .01])
seqlets = resize_seqlets(dfi2seqlets(df_not_found[df_not_found.match_weighted_p > .01]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
pattern = 'metacluster_1/pattern_0'
tfc = 'Nanog'
mr.get_pattern(pattern).plot("seq_ic");
dfips = mr.get_seqlet_intervals(pattern, trim_frac=0.08, as_df=True)
dfips = dfips[~dfips[['chrom', 'start', 'end', 'strand']].duplicated()]
modisco_seqlets = [s for i, s in enumerate(mr._get_seqlets(pattern, trim_frac=0.08)) if i in dfips.index]
dfips.head()
# get the right pattern
dfip = dfi[dfi.pattern == pattern][['pattern', 'example_idx', 'pattern_start', 'pattern_end', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand', 'imp_weighted_p', 'match_weighted_p', 'seq_match_p']]
# remove any remaining duplicates
dfip = dfip[~dfip[['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand']].duplicated()]
# require a minimal match to be the same as for seqlets
dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > 0.0)]
# dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > .2)]
from basepair.modisco.pattern_instances import dfi2seqlets, annotate_profile
from basepair.modisco.results import Seqlet, resize_seqlets
from basepair.plot.profiles import extract_signal
from basepair.plot.profiles import plot_stranded_profile, multiple_plot_stranded_profile
from basepair.plot.heatmaps import heatmap_stranded_profile, multiple_heatmap_stranded_profile
seqlets = resize_seqlets(modisco_seqlets, 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip[dfip.match_weighted_p > 0.1]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip[dfip.match_weighted_p > 0.01]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
# merge
dfips['seqlet_id'] = np.arange(len(dfips))
dfip_merged = pd.merge(dfip, dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='left')
df_not_found = dfip_merged[dfip_merged.start.isnull()]
df_not_found['pattern'] = df_not_found['pattern_x']
dfip_merged_s = pd.merge(dfip[dfip.match_weighted_p > .01], dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='right')
df_not_found_s = dfip_merged_s[dfip_merged_s.pattern_x.isnull()]
df_not_found_s['pattern'] = df_not_found_s['pattern_x']
not_rediscovered_seqlets = [s for i, s in enumerate(modisco_seqlets) if i in df_not_found_s.seqlet_id]
len(not_rediscovered_seqlets)
seqlets = resize_seqlets(not_rediscovered_seqlets, 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
len(df_not_found[df_not_found.match_weighted_p > .2])
seqlets = resize_seqlets(dfi2seqlets(df_not_found[df_not_found.match_weighted_p > .2]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
len(df_not_found[df_not_found.match_weighted_p > .01])
seqlets = resize_seqlets(dfi2seqlets(df_not_found[df_not_found.match_weighted_p > .01]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));