Goal¶

check pattern co-ooccurence

# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
from basepair.modisco.pattern_instances import load_instances, filter_nonoverlapping_intervals

# Common paths on Surya
ddir = Path("/srv/scratch/avsec/workspace/chipnexus/data/")
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/deeplift/profile/"

mr = ModiscoResult(modisco_dir / 'modisco.h5')
mr.open()

# load scanned instances
dfi = load_instances(modisco_dir / 'instances.parq', dedup=False)
dfi = filter_nonoverlapping_intervals(dfi)

dfi.head()

%matplotlib inline

o = []
for pattern in tqdm(mr.patterns()):
    # get seqlet regions
    dfips = mr.get_seqlet_intervals(pattern, trim_frac=0.08, as_df=True)
    # remove duplicates
    dfips = dfips[~dfips[['chrom', 'start', 'end', 'strand']].duplicated()]

    # get the right pattern
    dfip = dfi[dfi.pattern == pattern][['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand', 'imp_weighted_p', 'match_weighted_p', 'seq_match_p']]
    # remove any remaining duplicates
    dfip = dfip[~dfip[['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand']].duplicated()]
    # require a minimal match to be the same as for seqlets
    dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > 0)]

    # merge the two
    common = pd.merge(dfip, dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='inner')
    o.append({"pattern": pattern, "n_seqlets": len(dfips), "n_newly_matched": len(dfip), "match_weighted_p": 0, "seqlets_covered": len(common)})
    
    dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > .01)]
    common = pd.merge(dfip, dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='inner')
    o.append({"pattern": pattern, "n_seqlets": len(dfips), "n_newly_matched": len(dfip), "match_weighted_p": .01, "seqlets_covered": len(common)})
    
    dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > .2)]
    common = pd.merge(dfip, dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='inner')
    o.append({"pattern": pattern, "n_seqlets": len(dfips), "n_newly_matched": len(dfip), "match_weighted_p": .2, "seqlets_covered": len(common)})

100%|██████████| 122/122 [10:21<00:00,  6.70s/it]

df = pd.DataFrame(o)

df = df.set_index("pattern")

df['matched'] = df.seqlets_covered

df['seqlets_covered'] = df.matched / df.n_seqlets

match_weighted_p > 0¶

df[df.match_weighted_p == 0].plot.scatter("n_seqlets", "n_newly_matched")
plt.xscale("log")
plt.yscale("log")
plt.plot([10, 1e5], [10, 1e5]);

df[df.match_weighted_p == 0].plot.scatter("n_seqlets", 'seqlets_covered')
plt.xscale("log")

match_weighted_p > .01¶

df[df.match_weighted_p == 0.01].plot.scatter("n_seqlets", "n_newly_matched")
plt.xscale("log")
plt.yscale("log")
plt.plot([10, 1e5], [10, 1e5]);

df[df.match_weighted_p == 0.01].plot.scatter("n_seqlets", 'seqlets_covered')
plt.xscale("log")

match_weighted_p > .2¶

df[df.match_weighted_p == .2].plot.scatter("n_seqlets", "n_newly_matched")
plt.xscale("log")
plt.yscale("log")
plt.plot([10, 1e5], [10, 1e5]);

df[df.match_weighted_p == .2].plot.scatter("n_seqlets", 'seqlets_covered')
plt.xscale("log")

Profile plots¶

For a motif, check the heatmaps for:

seqlets
motif instances
motif instances not as seqlets

from basepair.cli.imp_score import ImpScoreFile

imp_scores = ImpScoreFile.from_modisco_dir(modisco_dir)
profiles = imp_scores.get_profiles()

pattern = 'metacluster_0/pattern_0'
tfc = 'Oct4'

mr.get_pattern(pattern).plot("seq_ic");

dfips = mr.get_seqlet_intervals(pattern, trim_frac=0.08, as_df=True)
dfips = dfips[~dfips[['chrom', 'start', 'end', 'strand']].duplicated()]
modisco_seqlets = [s for i, s in enumerate(mr._get_seqlets(pattern, trim_frac=0.08)) if i in dfips.index]

dfips.head()

# get the right pattern
dfip = dfi[dfi.pattern == pattern][['pattern', 'example_idx', 'pattern_start', 'pattern_end', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand', 'imp_weighted_p', 'match_weighted_p', 'seq_match_p']]
# remove any remaining duplicates
dfip = dfip[~dfip[['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand']].duplicated()]
# require a minimal match to be the same as for seqlets
dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > 0.0)]

# dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > .2)]

from basepair.modisco.pattern_instances import dfi2seqlets, annotate_profile
from basepair.modisco.results import Seqlet, resize_seqlets
from basepair.plot.profiles import extract_signal
from basepair.plot.profiles import  plot_stranded_profile, multiple_plot_stranded_profile
from basepair.plot.heatmaps import heatmap_stranded_profile, multiple_heatmap_stranded_profile

Modisco¶

seqlets = resize_seqlets(modisco_seqlets, 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

Scanned¶

match_weighted_p > .2¶

# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip[dfip.match_weighted_p > 0.1]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

match_weighted_p > .01¶

# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip[dfip.match_weighted_p > 0.01]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

match_weighted_p > 0¶

# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

# merge
dfips['seqlet_id'] = np.arange(len(dfips))
dfip_merged = pd.merge(dfip, dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='left')
df_not_found = dfip_merged[dfip_merged.start.isnull()]
df_not_found['pattern'] = df_not_found['pattern_x']

dfip_merged_s = pd.merge(dfip[dfip.match_weighted_p > .01], dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='right')
df_not_found_s = dfip_merged_s[dfip_merged_s.pattern_x.isnull()]
df_not_found_s['pattern'] = df_not_found_s['pattern_x']

not_rediscovered_seqlets = [s for i, s in enumerate(modisco_seqlets) if i in df_not_found_s.seqlet_id]

Not overlapping¶

Seqlets¶

len(not_rediscovered_seqlets)

414

seqlets = resize_seqlets(not_rediscovered_seqlets, 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

Scanned¶

imp_weighted > .2¶

len(df_not_found[df_not_found.match_weighted_p > .2])

6723

seqlets = resize_seqlets(dfi2seqlets(df_not_found[df_not_found.match_weighted_p > .2]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

match_weighted_p > .01¶

len(df_not_found[df_not_found.match_weighted_p > .01])

22289

seqlets = resize_seqlets(dfi2seqlets(df_not_found[df_not_found.match_weighted_p > .01]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

Sox2¶

pattern = 'metacluster_0/pattern_1'
tfc = 'Sox2'

mr.get_pattern(pattern).plot("seq_ic");

dfips = mr.get_seqlet_intervals(pattern, trim_frac=0.08, as_df=True)
dfips = dfips[~dfips[['chrom', 'start', 'end', 'strand']].duplicated()]
modisco_seqlets = [s for i, s in enumerate(mr._get_seqlets(pattern, trim_frac=0.08)) if i in dfips.index]

dfips.head()

# get the right pattern
dfip = dfi[dfi.pattern == pattern][['pattern', 'example_idx', 'pattern_start', 'pattern_end', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand', 'imp_weighted_p', 'match_weighted_p', 'seq_match_p']]
# remove any remaining duplicates
dfip = dfip[~dfip[['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand']].duplicated()]
# require a minimal match to be the same as for seqlets
dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > 0.0)]

# dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > .2)]

from basepair.modisco.pattern_instances import dfi2seqlets, annotate_profile
from basepair.modisco.results import Seqlet, resize_seqlets
from basepair.plot.profiles import extract_signal
from basepair.plot.profiles import  plot_stranded_profile, multiple_plot_stranded_profile
from basepair.plot.heatmaps import heatmap_stranded_profile, multiple_heatmap_stranded_profile

Modisco¶

seqlets = resize_seqlets(modisco_seqlets, 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

Scanned¶

match_weighted_p > .2¶

# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip[dfip.match_weighted_p > 0.1]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

match_weighted_p > .01¶

# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip[dfip.match_weighted_p > 0.01]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

match_weighted_p > 0¶

# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

# merge
dfips['seqlet_id'] = np.arange(len(dfips))
dfip_merged = pd.merge(dfip, dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='left')
df_not_found = dfip_merged[dfip_merged.start.isnull()]
df_not_found['pattern'] = df_not_found['pattern_x']

dfip_merged_s = pd.merge(dfip[dfip.match_weighted_p > .01], dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='right')
df_not_found_s = dfip_merged_s[dfip_merged_s.pattern_x.isnull()]
df_not_found_s['pattern'] = df_not_found_s['pattern_x']

not_rediscovered_seqlets = [s for i, s in enumerate(modisco_seqlets) if i in df_not_found_s.seqlet_id]

Not overlapping¶

Seqlets¶

len(not_rediscovered_seqlets)

184

seqlets = resize_seqlets(not_rediscovered_seqlets, 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

Scanned¶

imp_weighted > .2¶

len(df_not_found[df_not_found.match_weighted_p > .2])

8160

seqlets = resize_seqlets(dfi2seqlets(df_not_found[df_not_found.match_weighted_p > .2]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

match_weighted_p > .01¶

len(df_not_found[df_not_found.match_weighted_p > .01])

34064

seqlets = resize_seqlets(dfi2seqlets(df_not_found[df_not_found.match_weighted_p > .01]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

from basepair.exp.paper.config import motifs

motifs

OrderedDict([('Oct4-Sox2', 'm0_p0'),
             ('Sox2', 'm0_p1'),
             ('Nanog', 'm2_p0'),
             ('Klf4', 'm1_p0')])

Nanog¶

pattern = 'metacluster_2/pattern_0'
tfc = 'Nanog'

mr.get_pattern(pattern).plot("seq_ic");

dfips = mr.get_seqlet_intervals(pattern, trim_frac=0.08, as_df=True)
dfips = dfips[~dfips[['chrom', 'start', 'end', 'strand']].duplicated()]
modisco_seqlets = [s for i, s in enumerate(mr._get_seqlets(pattern, trim_frac=0.08)) if i in dfips.index]

dfips.head()

# get the right pattern
dfip = dfi[dfi.pattern == pattern][['pattern', 'example_idx', 'pattern_start', 'pattern_end', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand', 'imp_weighted_p', 'match_weighted_p', 'seq_match_p']]
# remove any remaining duplicates
dfip = dfip[~dfip[['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand']].duplicated()]
# require a minimal match to be the same as for seqlets
dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > 0.0)]

# dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > .2)]

from basepair.modisco.pattern_instances import dfi2seqlets, annotate_profile
from basepair.modisco.results import Seqlet, resize_seqlets
from basepair.plot.profiles import extract_signal
from basepair.plot.profiles import  plot_stranded_profile, multiple_plot_stranded_profile
from basepair.plot.heatmaps import heatmap_stranded_profile, multiple_heatmap_stranded_profile

Modisco¶

seqlets = resize_seqlets(modisco_seqlets, 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

Scanned¶

match_weighted_p > .2¶

# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip[dfip.match_weighted_p > 0.1]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

match_weighted_p > .01¶

# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip[dfip.match_weighted_p > 0.01]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

match_weighted_p > 0¶

# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

# merge
dfips['seqlet_id'] = np.arange(len(dfips))
dfip_merged = pd.merge(dfip, dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='left')
df_not_found = dfip_merged[dfip_merged.start.isnull()]
df_not_found['pattern'] = df_not_found['pattern_x']

dfip_merged_s = pd.merge(dfip[dfip.match_weighted_p > .01], dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='right')
df_not_found_s = dfip_merged_s[dfip_merged_s.pattern_x.isnull()]
df_not_found_s['pattern'] = df_not_found_s['pattern_x']

not_rediscovered_seqlets = [s for i, s in enumerate(modisco_seqlets) if i in df_not_found_s.seqlet_id]

Not overlapping¶

Seqlets¶

len(not_rediscovered_seqlets)

795

seqlets = resize_seqlets(not_rediscovered_seqlets, 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

Scanned¶

imp_weighted > .2¶

len(df_not_found[df_not_found.match_weighted_p > .2])

33803

seqlets = resize_seqlets(dfi2seqlets(df_not_found[df_not_found.match_weighted_p > .2]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

match_weighted_p > .01¶

len(df_not_found[df_not_found.match_weighted_p > .01])

104267

seqlets = resize_seqlets(dfi2seqlets(df_not_found[df_not_found.match_weighted_p > .01]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

Klf4¶

pattern = 'metacluster_1/pattern_0'
tfc = 'Nanog'

mr.get_pattern(pattern).plot("seq_ic");

dfips = mr.get_seqlet_intervals(pattern, trim_frac=0.08, as_df=True)
dfips = dfips[~dfips[['chrom', 'start', 'end', 'strand']].duplicated()]
modisco_seqlets = [s for i, s in enumerate(mr._get_seqlets(pattern, trim_frac=0.08)) if i in dfips.index]

dfips.head()

# get the right pattern
dfip = dfi[dfi.pattern == pattern][['pattern', 'example_idx', 'pattern_start', 'pattern_end', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand', 'imp_weighted_p', 'match_weighted_p', 'seq_match_p']]
# remove any remaining duplicates
dfip = dfip[~dfip[['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand']].duplicated()]
# require a minimal match to be the same as for seqlets
dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > 0.0)]

# dfip = dfip[(dfip.imp_weighted_p > 0) & (dfip.match_weighted_p > .2)]

from basepair.modisco.pattern_instances import dfi2seqlets, annotate_profile
from basepair.modisco.results import Seqlet, resize_seqlets
from basepair.plot.profiles import extract_signal
from basepair.plot.profiles import  plot_stranded_profile, multiple_plot_stranded_profile
from basepair.plot.heatmaps import heatmap_stranded_profile, multiple_heatmap_stranded_profile

Modisco¶

seqlets = resize_seqlets(modisco_seqlets, 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

Scanned¶

match_weighted_p > .2¶

# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip[dfip.match_weighted_p > 0.1]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

match_weighted_p > .01¶

# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip[dfip.match_weighted_p > 0.01]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

match_weighted_p > 0¶

# match_weighted > .2
seqlets = resize_seqlets(dfi2seqlets(dfip), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

# merge
dfips['seqlet_id'] = np.arange(len(dfips))
dfip_merged = pd.merge(dfip, dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='left')
df_not_found = dfip_merged[dfip_merged.start.isnull()]
df_not_found['pattern'] = df_not_found['pattern_x']

dfip_merged_s = pd.merge(dfip[dfip.match_weighted_p > .01], dfips, left_on=['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand'], right_on=['chrom', 'start', 'end', 'strand'], how='right')
df_not_found_s = dfip_merged_s[dfip_merged_s.pattern_x.isnull()]
df_not_found_s['pattern'] = df_not_found_s['pattern_x']

not_rediscovered_seqlets = [s for i, s in enumerate(modisco_seqlets) if i in df_not_found_s.seqlet_id]

Not overlapping¶

Seqlets¶

len(not_rediscovered_seqlets)

2321

seqlets = resize_seqlets(not_rediscovered_seqlets, 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

Scanned¶

imp_weighted > .2¶

len(df_not_found[df_not_found.match_weighted_p > .2])

18842

seqlets = resize_seqlets(dfi2seqlets(df_not_found[df_not_found.match_weighted_p > .2]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

match_weighted_p > .01¶

len(df_not_found[df_not_found.match_weighted_p > .01])

49603

seqlets = resize_seqlets(dfi2seqlets(df_not_found[df_not_found.match_weighted_p > .01]), 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
sort_idx = np.argsort(-seqlet_profiles[tfc].max(axis=(1,2)))
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx[:1000], figsize=(10,10));

	pattern	pattern_start	pattern_end	strand	pattern_len	pattern_center	match_weighted	match_weighted_p	match_weighted_cat	match_max	match_max_task	imp_weighted	imp_weighted_p	imp_weighted_cat	imp_max	imp_max_task	seq_match	seq_match_p	seq_match_cat	match/Klf4	match/Nanog	match/Oct4	match/Sox2	imp/Klf4	imp/Nanog	imp/Oct4	imp/Sox2	example_chrom	example_start	example_end	example_strand	example_interval_from_task	pattern_short	pattern_start_abs	pattern_end_abs
0	metacluster_0/pattern_0	104	119	+	15	111	0.2469	0.0010	low	0.4484	Oct4	0.0365	NaN	NaN	0.0524	Oct4	3.9825	0.0085	low	0.2654	-0.3593	0.4484	0.3725	0.0235	0.0246	0.0524	0.0313	chrX	143482544	143483544	*	Oct4	m0_p0	143482648	143482663
1	metacluster_0/pattern_0	263	278	-	15	270	0.2518	0.0011	low	0.2893	Oct4	0.0217	NaN	NaN	0.0342	Nanog	1.1066	0.0007	low	0.1618	0.2170	0.2893	0.2803	0.0143	0.0342	0.0200	0.0203	chrX	143482544	143483544	*	Oct4	m0_p0	143482807	143482822
2	metacluster_0/pattern_0	282	297	+	15	289	0.3466	0.0069	low	0.3870	Sox2	0.0272	NaN	NaN	0.0296	Sox2	5.0100	0.0188	low	0.2257	0.3454	0.3723	0.3870	0.0233	0.0258	0.0279	0.0296	chrX	143482544	143483544	*	Oct4	m0_p0	143482826	143482841
3	metacluster_0/pattern_0	445	460	+	15	452	0.2399	0.0008	low	0.2469	Sox2	0.1852	0.0008	high	0.4145	Nanog	1.7775	0.0012	low	0.2458	0.2408	0.2314	0.2469	0.1234	0.4145	0.0949	0.1896	chrX	143482544	143483544	*	Oct4	m0_p0	143482989	143483004
4	metacluster_0/pattern_0	475	490	-	15	482	0.2903	0.0025	low	0.3322	Oct4	0.2889	0.0179	high	0.5196	Nanog	3.3390	0.0057	low	0.2092	0.2681	0.3322	0.2991	0.1613	0.5196	0.2049	0.3241	chrX	143482544	143483544	*	Oct4	m0_p0	143483019	143483034

	chrom	start	end	interval_from_task	seqname	strand	pattern
0	chr17	35504049	35504064	Oct4	2	+	metacluster_0/pattern_0
4	chr5	75085360	75085375	Sox2	31157	+	metacluster_0/pattern_0
5	chr8	113598589	113598604	Sox2	29329	+	metacluster_0/pattern_0
6	chr5	131526931	131526946	Oct4	164	+	metacluster_0/pattern_0
12	chrX	61393375	61393390	Sox2	23380	+	metacluster_0/pattern_0

	chrom	start	end	interval_from_task	seqname	strand	pattern
0	chr17	29187380	29187402	Nanog	33172	+	metacluster_0/pattern_1
4	chr12	56268639	56268661	Nanog	36552	+	metacluster_0/pattern_1
5	chr13	14290196	14290218	Oct4	5238	+	metacluster_0/pattern_1
6	chr6	99310277	99310299	Nanog	38522	+	metacluster_0/pattern_1
12	chr19	55418947	55418969	Klf4	78746	+	metacluster_0/pattern_1

	chrom	start	end	interval_from_task	seqname	strand	pattern
0	chr7	78183558	78183567	Nanog	41261	-	metacluster_2/pattern_0
1	chr2	17936888	17936897	Klf4	75670	-	metacluster_2/pattern_0
2	chr3	122492241	122492250	Oct4	4242	-	metacluster_2/pattern_0
3	chr4	132018778	132018787	Oct4	8779	-	metacluster_2/pattern_0
4	chr9	6409208	6409217	Nanog	41511	-	metacluster_2/pattern_0

	chrom	start	end	interval_from_task	seqname	strand	pattern
0	chr11	35838497	35838507	Klf4	60114	+	metacluster_1/pattern_0
1	chr11	88758531	88758541	Oct4	16636	-	metacluster_1/pattern_0
4	chr4	34881427	34881437	Klf4	53871	+	metacluster_1/pattern_0
5	chr14	75845032	75845042	Klf4	60079	-	metacluster_1/pattern_0
6	chr19	17346877	17346887	Klf4	53173	-	metacluster_1/pattern_0