Goal

  • check pattern co-ooccurence
In [2]:
# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
from basepair.modisco.pattern_instances import load_instances, filter_nonoverlapping_intervals
In [7]:
# Common paths on Surya
ddir = Path("/srv/scratch/avsec/workspace/chipnexus/data/")
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/deeplift/profile/"
In [66]:
mr = ModiscoResult(modisco_dir / 'modisco.h5')
mr.open()
In [65]:
# load scanned instances
dfi = load_instances(modisco_dir / 'instances.parq', dedup=False)
dfi = filter_nonoverlapping_intervals(dfi)
In [67]:
dfi.head()
Out[67]:
pattern example_idx pattern_start pattern_end strand pattern_len pattern_center match_weighted match_weighted_p match_weighted_cat match_max match_max_task imp_weighted imp_weighted_p imp_weighted_cat imp_max imp_max_task seq_match seq_match_p seq_match_cat match/Klf4 match/Nanog match/Oct4 match/Sox2 imp/Klf4 imp/Nanog imp/Oct4 imp/Sox2 example_chrom example_start example_end example_strand example_interval_from_task pattern_short pattern_start_abs pattern_end_abs
0 metacluster_0/pattern_0 0 104 119 + 15 111 0.2469 0.0010 low 0.4484 Oct4 0.0365 NaN NaN 0.0524 Oct4 3.9825 0.0085 low 0.2654 -0.3593 0.4484 0.3725 0.0235 0.0246 0.0524 0.0313 chrX 143482544 143483544 * Oct4 m0_p0 143482648 143482663
1 metacluster_0/pattern_0 0 263 278 - 15 270 0.2518 0.0011 low 0.2893 Oct4 0.0217 NaN NaN 0.0342 Nanog 1.1066 0.0007 low 0.1618 0.2170 0.2893 0.2803 0.0143 0.0342 0.0200 0.0203 chrX 143482544 143483544 * Oct4 m0_p0 143482807 143482822
2 metacluster_0/pattern_0 0 282 297 + 15 289 0.3466 0.0069 low 0.3870 Sox2 0.0272 NaN NaN 0.0296 Sox2 5.0100 0.0188 low 0.2257 0.3454 0.3723 0.3870 0.0233 0.0258 0.0279 0.0296 chrX 143482544 143483544 * Oct4 m0_p0 143482826 143482841
3 metacluster_0/pattern_0 0 445 460 + 15 452 0.2399 0.0008 low 0.2469 Sox2 0.1852 0.0008 high 0.4145 Nanog 1.7775 0.0012 low 0.2458 0.2408 0.2314 0.2469 0.1234 0.4145 0.0949 0.1896 chrX 143482544 143483544 * Oct4 m0_p0 143482989 143483004
4 metacluster_0/pattern_0 0 475 490 - 15 482 0.2903 0.0025 low 0.3322 Oct4 0.2889 0.0179 high 0.5196 Nanog 3.3390 0.0057 low 0.2092 0.2681 0.3322 0.2991 0.1613 0.5196 0.2049 0.3241 chrX 143482544 143483544 * Oct4 m0_p0 143483019 143483034
In [74]:
%matplotlib inline
In [261]:
o = []
for pattern in tqdm(mr.patterns()):
    # get seqlet regions
    dfips = mr.get_seqlet_intervals(pattern, trim_frac=0.08, as_df=True)
    # remove duplicates
    dfips = dfips[~dfips[['chrom', 'start', 'end', 'strand']].duplicated()]

    # get the right pattern
    dfip = dfi[dfi.pattern == pattern][['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand', 'imp_weighted_p', 'match_weighted_p', 'seq_match_p']]
    # remove any remaining duplicates
    dfip = dfip[~dfip[['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand']].duplicated()]
    # require a minimal match to be the same as for seqlets
    dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > 0)]

    # merge the two
    common = pd.merge(dfip, dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='inner')
    o.append({"pattern": pattern, "n_seqlets": len(dfips), "n_newly_matched": len(dfip), "match_weighted_p": 0, "seqlets_covered": len(common)})
    
    dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > .01)]
    common = pd.merge(dfip, dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='inner')
    o.append({"pattern": pattern, "n_seqlets": len(dfips), "n_newly_matched": len(dfip), "match_weighted_p": .01, "seqlets_covered": len(common)})
    
    dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > .2)]
    common = pd.merge(dfip, dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='inner')
    o.append({"pattern": pattern, "n_seqlets": len(dfips), "n_newly_matched": len(dfip), "match_weighted_p": .2, "seqlets_covered": len(common)})
100%|██████████| 122/122 [10:21<00:00,  6.70s/it]
In [262]:
df = pd.DataFrame(o)
In [263]:
df = df.set_index("pattern")
In [264]:
df['matched'] = df.seqlets_covered

df['seqlets_covered'] = df.matched / df.n_seqlets

match_weighted_p > 0

In [269]:
df[df.match_weighted_p == 0].plot.scatter("n_seqlets", "n_newly_matched")
plt.xscale("log")
plt.yscale("log")
plt.plot([10, 1e5], [10, 1e5]);
In [270]:
df[df.match_weighted_p == 0].plot.scatter("n_seqlets", 'seqlets_covered')
plt.xscale("log")

match_weighted_p > .01

In [271]:
df[df.match_weighted_p == 0.01].plot.scatter("n_seqlets", "n_newly_matched")
plt.xscale("log")
plt.yscale("log")
plt.plot([10, 1e5], [10, 1e5]);
In [272]:
df[df.match_weighted_p == 0.01].plot.scatter("n_seqlets", 'seqlets_covered')
plt.xscale("log")

match_weighted_p > .2

In [273]:
df[df.match_weighted_p == .2].plot.scatter("n_seqlets", "n_newly_matched")
plt.xscale("log")
plt.yscale("log")
plt.plot([10, 1e5], [10, 1e5]);
In [274]:
df[df.match_weighted_p == .2].plot.scatter("n_seqlets", 'seqlets_covered')
plt.xscale("log")

Profile plots

For a motif, check the heatmaps for:

  • seqlets
  • motif instances
  • motif instances not as seqlets
In [212]:
from basepair.cli.imp_score import ImpScoreFile

imp_scores = ImpScoreFile.from_modisco_dir(modisco_dir)
profiles = imp_scores.get_profiles()
In [307]:
pattern = 'metacluster_0/pattern_0'
tfc = 'Oct4'
In [308]:
mr.get_pattern(pattern).plot("seq_ic");
In [309]:
dfips = mr.get_seqlet_intervals(pattern, trim_frac=0.08, as_df=True)
dfips = dfips[~dfips[['chrom', 'start', 'end', 'strand']].duplicated()]
modisco_seqlets = [s for i, s in enumerate(mr._get_seqlets(pattern, trim_frac=0.08)) if i in dfips.index]
In [310]:
dfips.head()
Out[310]:
chrom start end interval_from_task seqname strand pattern
0 chr17 35504049 35504064 Oct4 2 + metacluster_0/pattern_0
4 chr5 75085360 75085375 Sox2 31157 + metacluster_0/pattern_0
5 chr8 113598589 113598604 Sox2 29329 + metacluster_0/pattern_0
6 chr5 131526931 131526946 Oct4 164 + metacluster_0/pattern_0
12 chrX 61393375 61393390 Sox2 23380 + metacluster_0/pattern_0
In [311]:
# get the right pattern
dfip = dfi[dfi.pattern == pattern][['pattern', 'example_idx', 'pattern_start', 'pattern_end', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand', 'imp_weighted_p', 'match_weighted_p', 'seq_match_p']]
# remove any remaining duplicates
dfip = dfip[~dfip[['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand']].duplicated()]
# require a minimal match to be the same as for seqlets
dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > 0.0)]
In [312]:
# dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > .2)]
In [313]:
from basepair.modisco.pattern_instances import dfi2seqlets, annotate_profile
from basepair.modisco.results import Seqlet, resize_seqlets
from basepair.plot.profiles import extract_signal
from basepair.plot.profiles import  plot_stranded_profile, multiple_plot_stranded_profile
from basepair.plot.heatmaps import heatmap_stranded_profile, multiple_heatmap_stranded_profile

Modisco

In [314]:
seqlets = resize_seqlets(modisco_seqlets, 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

Scanned

match_weighted_p > .2

In [315]:
# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip[dfip.match_weighted_p > 0.1]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

match_weighted_p > .01

In [316]:
# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip[dfip.match_weighted_p > 0.01]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

match_weighted_p > 0

In [317]:
# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
In [318]:
# merge
dfips['seqlet_id'] = np.arange(len(dfips))
dfip_merged = pd.merge(dfip, dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='left')
df_not_found = dfip_merged[dfip_merged.start.isnull()]
df_not_found['pattern'] = df_not_found['pattern_x']
In [319]:
dfip_merged_s = pd.merge(dfip[dfip.match_weighted_p > .01], dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='right')
df_not_found_s = dfip_merged_s[dfip_merged_s.pattern_x.isnull()]
df_not_found_s['pattern'] = df_not_found_s['pattern_x']

not_rediscovered_seqlets = [s for i, s in enumerate(modisco_seqlets) if i in df_not_found_s.seqlet_id]

Not overlapping

Seqlets

In [320]:
len(not_rediscovered_seqlets)
Out[320]:
414
In [321]:
seqlets = resize_seqlets(not_rediscovered_seqlets, 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

Scanned

imp_weighted > .2

In [322]:
len(df_not_found[df_not_found.match_weighted_p > .2])
Out[322]:
6723
In [323]:
seqlets = resize_seqlets(dfi2seqlets(df_not_found[df_not_found.match_weighted_p > .2]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

match_weighted_p > .01

In [324]:
len(df_not_found[df_not_found.match_weighted_p > .01])
Out[324]:
22289
In [325]:
seqlets = resize_seqlets(dfi2seqlets(df_not_found[df_not_found.match_weighted_p > .01]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

Sox2

In [326]:
pattern = 'metacluster_0/pattern_1'
tfc = 'Sox2'
In [327]:
mr.get_pattern(pattern).plot("seq_ic");
In [328]:
dfips = mr.get_seqlet_intervals(pattern, trim_frac=0.08, as_df=True)
dfips = dfips[~dfips[['chrom', 'start', 'end', 'strand']].duplicated()]
modisco_seqlets = [s for i, s in enumerate(mr._get_seqlets(pattern, trim_frac=0.08)) if i in dfips.index]
In [329]:
dfips.head()
Out[329]:
chrom start end interval_from_task seqname strand pattern
0 chr17 29187380 29187402 Nanog 33172 + metacluster_0/pattern_1
4 chr12 56268639 56268661 Nanog 36552 + metacluster_0/pattern_1
5 chr13 14290196 14290218 Oct4 5238 + metacluster_0/pattern_1
6 chr6 99310277 99310299 Nanog 38522 + metacluster_0/pattern_1
12 chr19 55418947 55418969 Klf4 78746 + metacluster_0/pattern_1
In [330]:
# get the right pattern
dfip = dfi[dfi.pattern == pattern][['pattern', 'example_idx', 'pattern_start', 'pattern_end', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand', 'imp_weighted_p', 'match_weighted_p', 'seq_match_p']]
# remove any remaining duplicates
dfip = dfip[~dfip[['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand']].duplicated()]
# require a minimal match to be the same as for seqlets
dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > 0.0)]
In [331]:
# dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > .2)]
In [332]:
from basepair.modisco.pattern_instances import dfi2seqlets, annotate_profile
from basepair.modisco.results import Seqlet, resize_seqlets
from basepair.plot.profiles import extract_signal
from basepair.plot.profiles import  plot_stranded_profile, multiple_plot_stranded_profile
from basepair.plot.heatmaps import heatmap_stranded_profile, multiple_heatmap_stranded_profile

Modisco

In [333]:
seqlets = resize_seqlets(modisco_seqlets, 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

Scanned

match_weighted_p > .2

In [334]:
# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip[dfip.match_weighted_p > 0.1]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

match_weighted_p > .01

In [335]:
# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip[dfip.match_weighted_p > 0.01]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

match_weighted_p > 0

In [336]:
# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
In [337]:
# merge
dfips['seqlet_id'] = np.arange(len(dfips))
dfip_merged = pd.merge(dfip, dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='left')
df_not_found = dfip_merged[dfip_merged.start.isnull()]
df_not_found['pattern'] = df_not_found['pattern_x']
In [338]:
dfip_merged_s = pd.merge(dfip[dfip.match_weighted_p > .01], dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='right')
df_not_found_s = dfip_merged_s[dfip_merged_s.pattern_x.isnull()]
df_not_found_s['pattern'] = df_not_found_s['pattern_x']

not_rediscovered_seqlets = [s for i, s in enumerate(modisco_seqlets) if i in df_not_found_s.seqlet_id]

Not overlapping

Seqlets

In [339]:
len(not_rediscovered_seqlets)
Out[339]:
184
In [340]:
seqlets = resize_seqlets(not_rediscovered_seqlets, 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

Scanned

imp_weighted > .2

In [341]:
len(df_not_found[df_not_found.match_weighted_p > .2])
Out[341]:
8160
In [342]:
seqlets = resize_seqlets(dfi2seqlets(df_not_found[df_not_found.match_weighted_p > .2]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

match_weighted_p > .01

In [343]:
len(df_not_found[df_not_found.match_weighted_p > .01])
Out[343]:
34064
In [344]:
seqlets = resize_seqlets(dfi2seqlets(df_not_found[df_not_found.match_weighted_p > .01]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
In [346]:
from basepair.exp.paper.config import motifs
In [347]:
motifs
Out[347]:
OrderedDict([('Oct4-Sox2', 'm0_p0'),
             ('Sox2', 'm0_p1'),
             ('Nanog', 'm2_p0'),
             ('Klf4', 'm1_p0')])

Nanog

In [348]:
pattern = 'metacluster_2/pattern_0'
tfc = 'Nanog'
In [349]:
mr.get_pattern(pattern).plot("seq_ic");
In [350]:
dfips = mr.get_seqlet_intervals(pattern, trim_frac=0.08, as_df=True)
dfips = dfips[~dfips[['chrom', 'start', 'end', 'strand']].duplicated()]
modisco_seqlets = [s for i, s in enumerate(mr._get_seqlets(pattern, trim_frac=0.08)) if i in dfips.index]
In [351]:
dfips.head()
Out[351]:
chrom start end interval_from_task seqname strand pattern
0 chr7 78183558 78183567 Nanog 41261 - metacluster_2/pattern_0
1 chr2 17936888 17936897 Klf4 75670 - metacluster_2/pattern_0
2 chr3 122492241 122492250 Oct4 4242 - metacluster_2/pattern_0
3 chr4 132018778 132018787 Oct4 8779 - metacluster_2/pattern_0
4 chr9 6409208 6409217 Nanog 41511 - metacluster_2/pattern_0
In [352]:
# get the right pattern
dfip = dfi[dfi.pattern == pattern][['pattern', 'example_idx', 'pattern_start', 'pattern_end', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand', 'imp_weighted_p', 'match_weighted_p', 'seq_match_p']]
# remove any remaining duplicates
dfip = dfip[~dfip[['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand']].duplicated()]
# require a minimal match to be the same as for seqlets
dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > 0.0)]
In [353]:
# dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > .2)]
In [354]:
from basepair.modisco.pattern_instances import dfi2seqlets, annotate_profile
from basepair.modisco.results import Seqlet, resize_seqlets
from basepair.plot.profiles import extract_signal
from basepair.plot.profiles import  plot_stranded_profile, multiple_plot_stranded_profile
from basepair.plot.heatmaps import heatmap_stranded_profile, multiple_heatmap_stranded_profile

Modisco

In [355]:
seqlets = resize_seqlets(modisco_seqlets, 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

Scanned

match_weighted_p > .2

In [356]:
# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip[dfip.match_weighted_p > 0.1]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

match_weighted_p > .01

In [357]:
# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip[dfip.match_weighted_p > 0.01]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

match_weighted_p > 0

In [358]:
# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
In [359]:
# merge
dfips['seqlet_id'] = np.arange(len(dfips))
dfip_merged = pd.merge(dfip, dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='left')
df_not_found = dfip_merged[dfip_merged.start.isnull()]
df_not_found['pattern'] = df_not_found['pattern_x']
In [360]:
dfip_merged_s = pd.merge(dfip[dfip.match_weighted_p > .01], dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='right')
df_not_found_s = dfip_merged_s[dfip_merged_s.pattern_x.isnull()]
df_not_found_s['pattern'] = df_not_found_s['pattern_x']

not_rediscovered_seqlets = [s for i, s in enumerate(modisco_seqlets) if i in df_not_found_s.seqlet_id]

Not overlapping

Seqlets

In [361]:
len(not_rediscovered_seqlets)
Out[361]:
795
In [362]:
seqlets = resize_seqlets(not_rediscovered_seqlets, 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

Scanned

imp_weighted > .2

In [363]:
len(df_not_found[df_not_found.match_weighted_p > .2])
Out[363]:
33803
In [364]:
seqlets = resize_seqlets(dfi2seqlets(df_not_found[df_not_found.match_weighted_p > .2]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

match_weighted_p > .01

In [365]:
len(df_not_found[df_not_found.match_weighted_p > .01])
Out[365]:
104267
In [366]:
seqlets = resize_seqlets(dfi2seqlets(df_not_found[df_not_found.match_weighted_p > .01]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

Klf4

In [367]:
pattern = 'metacluster_1/pattern_0'
tfc = 'Nanog'
In [368]:
mr.get_pattern(pattern).plot("seq_ic");
In [369]:
dfips = mr.get_seqlet_intervals(pattern, trim_frac=0.08, as_df=True)
dfips = dfips[~dfips[['chrom', 'start', 'end', 'strand']].duplicated()]
modisco_seqlets = [s for i, s in enumerate(mr._get_seqlets(pattern, trim_frac=0.08)) if i in dfips.index]
In [370]:
dfips.head()
Out[370]:
chrom start end interval_from_task seqname strand pattern
0 chr11 35838497 35838507 Klf4 60114 + metacluster_1/pattern_0
1 chr11 88758531 88758541 Oct4 16636 - metacluster_1/pattern_0
4 chr4 34881427 34881437 Klf4 53871 + metacluster_1/pattern_0
5 chr14 75845032 75845042 Klf4 60079 - metacluster_1/pattern_0
6 chr19 17346877 17346887 Klf4 53173 - metacluster_1/pattern_0
In [371]:
# get the right pattern
dfip = dfi[dfi.pattern == pattern][['pattern', 'example_idx', 'pattern_start', 'pattern_end', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand', 'imp_weighted_p', 'match_weighted_p', 'seq_match_p']]
# remove any remaining duplicates
dfip = dfip[~dfip[['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand']].duplicated()]
# require a minimal match to be the same as for seqlets
dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > 0.0)]
In [372]:
# dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > .2)]
In [373]:
from basepair.modisco.pattern_instances import dfi2seqlets, annotate_profile
from basepair.modisco.results import Seqlet, resize_seqlets
from basepair.plot.profiles import extract_signal
from basepair.plot.profiles import  plot_stranded_profile, multiple_plot_stranded_profile
from basepair.plot.heatmaps import heatmap_stranded_profile, multiple_heatmap_stranded_profile

Modisco

In [374]:
seqlets = resize_seqlets(modisco_seqlets, 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

Scanned

match_weighted_p > .2

In [375]:
# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip[dfip.match_weighted_p > 0.1]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

match_weighted_p > .01

In [376]:
# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip[dfip.match_weighted_p > 0.01]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

match_weighted_p > 0

In [377]:
# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));
In [378]:
# merge
dfips['seqlet_id'] = np.arange(len(dfips))
dfip_merged = pd.merge(dfip, dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='left')
df_not_found = dfip_merged[dfip_merged.start.isnull()]
df_not_found['pattern'] = df_not_found['pattern_x']
In [379]:
dfip_merged_s = pd.merge(dfip[dfip.match_weighted_p > .01], dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='right')
df_not_found_s = dfip_merged_s[dfip_merged_s.pattern_x.isnull()]
df_not_found_s['pattern'] = df_not_found_s['pattern_x']

not_rediscovered_seqlets = [s for i, s in enumerate(modisco_seqlets) if i in df_not_found_s.seqlet_id]

Not overlapping

Seqlets

In [380]:
len(not_rediscovered_seqlets)
Out[380]:
2321
In [381]:
seqlets = resize_seqlets(not_rediscovered_seqlets, 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

Scanned

imp_weighted > .2

In [382]:
len(df_not_found[df_not_found.match_weighted_p > .2])
Out[382]:
18842
In [383]:
seqlets = resize_seqlets(dfi2seqlets(df_not_found[df_not_found.match_weighted_p > .2]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

match_weighted_p > .01

In [384]:
len(df_not_found[df_not_found.match_weighted_p > .01])
Out[384]:
49603
In [385]:
seqlets = resize_seqlets(dfi2seqlets(df_not_found[df_not_found.match_weighted_p > .01]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));