Goals

  • plot some heatmaps for seqlets inferred by find_instances

Conclusions

  • instance finding code is able to get the same patterns/profiles genome-wide for the 3 major motifs
  • percnormed_score > 0.7 is a good cutoff yielding ~ 5x more seqlets genome-wide
In [177]:
# common imports
from basepair.imports import *
from basepair.config import test_chr, valid_chr

Load the data

In [2]:
pattern1 = "metacluster_0/pattern_0"
pattern2 = "metacluster_0/pattern_1"

Modisco

In [3]:
ddir = Path(get_data_dir())
model_dir = ddir / "processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/"
modisco_dir = model_dir / "modisco/valid"
In [4]:
# Modisco
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()
tasks = mr.tasks()
In [5]:
ds = DataSpec.load(model_dir / "dataspec.yaml")

d = HDF5Reader.load(model_dir / "grad.all.h5")
In [9]:
id_hash = pd.DataFrame({"peak_id": d['metadata']['interval_from_task'], 
                        "example_idx": np.arange(d['metadata']['interval_from_task'].shape[0])})

Labelled instances

In [10]:
# load the instances data frame
df = pd.read_csv(modisco_dir / "all.instances.tsv", sep='\t')
df['pattern_id'] = "metacluster_" + df.metacluster.astype(str) + "/pattern_" + df.pattern.astype(str)
df['seqlet_center'] = (df.seqlet_start + df.seqlet_end) / 2
patterns = df.pattern_id.unique().tolist()
pattern_pssms = {pattern: mr.get_pssm(*pattern.split("/")) for pattern in patterns}
append_pattern_loc(df, pattern_pssms, trim_frac=0.08)
df['pattern_center'] = (df.pattern_start + df.pattern_end) / 2
df = df.merge(id_hash, on="example_idx")

Compute the optimal threshold

In [153]:
from pysam import FastaFile
In [223]:
fa = FastaFile(ds.fasta_file)

chrl = pd.DataFrame([(c,l) for c,l in zip(fa.references, fa.lengths) if "_" not in c], columns=['chr', 'l'])

mfrac = chrl.l.sum() / chrl[chrl.chr.isin(valid_chr)].l.sum()
mfrac
Out[223]:
5.4657122969026135
In [193]:
dfn = pd.DataFrame([dict(pattern_id=p, old=mr.n_seqlets(*p.split("/"))) for p in mr.patterns()])
In [292]:
dfm = df[df.percnormed_score > 0.7].groupby("pattern_id").size().reset_index().merge(dfn, on='pattern_id').rename(columns={0:"new"})
In [293]:
import seaborn as sns
import statsmodels.api as sm
In [294]:
model = sm.OLS(dfm.new, dfm.old)
results = model.fit()
results.conf_int()  # confidence interval for beta
Out[294]:
0 1
old 9.039921 11.396031
In [295]:
dfm.new.sum() / dfm.old.sum()
Out[295]:
10.810297691373025
In [296]:
sns.jointplot("old", "new", data=dfm, kind="reg")
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.
  warnings.warn("The 'normed' kwarg is deprecated, and has been "
Out[296]:
<seaborn.axisgrid.JointGrid at 0x7f84f991f668>

Functions

In [90]:
from basepair.plot.vdom import get_signal
from basepair.modisco.utils import ic_scale
from concise.utils.plot import seqlogo_fig, seqlogo
from basepair.plot.profiles import extract_signal
In [81]:
def df2seqlets(df, width=None):
    """Convert the pd.DataFrame of instances 
    """
    return [Seqlet(row.example_idx, 
                   row.pattern_start, 
                   row.pattern_end, 
                   name=row.pattern_id,
                   strand="-" if row.revcomp else "+").resize(width)
           for i, row in df.iterrows()]
In [118]:
from basepair.plot.vdom import fig2vdom, vdm_heatmaps

included_samples = np.ones(d['inputs'].shape[:1], dtype=bool)

tasks = mr.tasks()

top_n = 1000

Metacluster_0/pattern_1

percnormed_score in [0.7, 1]

In [269]:
pattern = 'metacluster_0/pattern_1'
In [270]:
dff = df[(df.pattern_id == pattern) & \
        (0.7 < df.percnormed_score) & \
        (df.percnormed_score < 1)
        ]
In [271]:
# Number of seqlets
len(dff)
Out[271]:
10554
In [272]:
seqlets = df2seqlets(dff, width=40)  # randomly sample 10% of the seqlets to speedup plotting
In [273]:
display(fig2vdom(mr.plot_pssm(*pattern.split("/"))))  # original
display(fig2vdom(seqlogo_fig(ic_scale(extract_signal(d['inputs'][included_samples], seqlets).mean(axis=0)))));  # inferred
vdm_heatmaps(seqlets, d, included_samples, tasks, pattern, 
             pssm_fig=p(), top_n=top_n, opened=False)
Out[273]:
Sequence:



ChIP-nexus counts:
Importance scores (profile)
Importance scores (counts)

percnormed_score in [0.5, 1]

In [297]:
pattern = 'metacluster_0/pattern_1'
In [298]:
dff = df[(df.pattern_id == pattern) & \
        (0.5 < df.percnormed_score) & \
        (df.percnormed_score < 1)
        ]
In [299]:
# Number of seqlets
len(dff)
Out[299]:
20203
In [300]:
seqlets = df2seqlets(dff, width=40)  # randomly sample 10% of the seqlets to speedup plotting
In [301]:
display(fig2vdom(mr.plot_pssm(*pattern.split("/"))))  # original
display(fig2vdom(seqlogo_fig(ic_scale(extract_signal(d['inputs'][included_samples], seqlets).mean(axis=0)))));  # inferred
vdm_heatmaps(seqlets, d, included_samples, tasks, pattern, 
             pssm_fig=p(), top_n=top_n, opened=False)
Out[301]:
Sequence:



ChIP-nexus counts:
Importance scores (profile)
Importance scores (counts)

Metacluster_0/pattern_1

percnormed_score in [0.2, 3]

In [129]:
dff = df[(df.pattern_id == pattern) & \
        (0.2 < df.percnormed_score) & \
        (df.percnormed_score < 0.3)
        ]
In [130]:
# Number of seqlets
len(dff)
Out[130]:
4802
In [131]:
seqlets = df2seqlets(dff, 40)
In [132]:
display(fig2vdom(mr.plot_pssm(*pattern.split("/"))))  # original
display(fig2vdom(seqlogo_fig(ic_scale(extract_signal(d['inputs'][included_samples], seqlets).mean(axis=0)))));  # inferred
vdm_heatmaps(seqlets, d, included_samples, tasks, pattern, 
             pssm_fig=p(), top_n=top_n, opened=False)
Out[132]:
Sequence:



ChIP-nexus counts:
Importance scores (profile)
Importance scores (counts)

Metacluster_0/pattern_0

percnormed_score in [0.7, 1]

In [316]:
pattern = 'metacluster_0/pattern_0'
In [283]:
dff = df[(df.pattern_id == pattern) & \
         (100 < df.pattern_start) & \
         (900 > df.pattern_end) & \
        (0.7 < df.percnormed_score) & \
        (df.percnormed_score < 1)
        ]
In [284]:
# Number of seqlets
len(dff)
Out[284]:
14937
In [285]:
seqlets = df2seqlets(dff, 40)
In [286]:
display(fig2vdom(mr.plot_pssm(*pattern.split("/"))))  # original
display(fig2vdom(seqlogo_fig(ic_scale(extract_signal(d['inputs'][included_samples], seqlets).mean(axis=0)))));  # inferred
vdm_heatmaps(seqlets, d, included_samples, tasks, pattern, 
             pssm_fig=p(), top_n=top_n, opened=False)
Out[286]:
Sequence:



ChIP-nexus counts:
Importance scores (profile)
Importance scores (counts)

percnormed_score in [0.5, 1]

In [317]:
dff = df[(df.pattern_id == pattern) & \
        (100 < df.pattern_start) & \
        (900 > df.pattern_end) & \
        (0.5 < df.percnormed_score) & \
        (df.percnormed_score < 1)
        ]
In [318]:
# Number of seqlets
len(dff)
Out[318]:
24695
In [319]:
seqlets = df2seqlets(dff, 40)
In [320]:
display(fig2vdom(mr.plot_pssm(*pattern.split("/"))))  # original
display(fig2vdom(seqlogo_fig(ic_scale(extract_signal(d['inputs'][included_samples], seqlets).mean(axis=0)))));  # inferred
vdm_heatmaps(seqlets, d, included_samples, tasks, pattern, 
             pssm_fig=p(), top_n=top_n, opened=False)
Out[320]:
Sequence:



ChIP-nexus counts:
Importance scores (profile)
Importance scores (counts)

percnormed_score in [0.2, 3]

In [138]:
dff = df[(df.pattern_id == pattern) & \
        (100 < df.pattern_start) & \
        (900 > df.pattern_end) & \
        (0.2 < df.percnormed_score) & \
        (df.percnormed_score < 0.3)
        ]
In [139]:
# Number of seqlets
len(dff)
Out[139]:
5049
In [140]:
seqlets = df2seqlets(dff, 40)
In [141]:
display(fig2vdom(mr.plot_pssm(*pattern.split("/"))))  # original
display(fig2vdom(seqlogo_fig(ic_scale(extract_signal(d['inputs'][included_samples], seqlets).mean(axis=0)))));  # inferred
vdm_heatmaps(seqlets, d, included_samples, tasks, pattern, 
             pssm_fig=p(), top_n=top_n, opened=False)
Out[141]:
Sequence:



ChIP-nexus counts:
Importance scores (profile)
Importance scores (counts)

Metacluster_0/pattern_9

percnormed_score in [0.7, 1]

In [311]:
pattern = 'metacluster_0/pattern_9'
In [288]:
dff = df[(df.pattern_id == pattern) & \
        (100 < df.pattern_start) & \
        (900 > df.pattern_end) & \
        (0.7 < df.percnormed_score) & \
        (df.percnormed_score < 1)
        ]
In [289]:
# Number of seqlets
len(dff)
Out[289]:
499
In [290]:
seqlets = df2seqlets(dff, 40)
In [291]:
display(fig2vdom(mr.plot_pssm(*pattern.split("/"))))  # original
display(fig2vdom(seqlogo_fig(ic_scale(extract_signal(d['inputs'][included_samples], seqlets).mean(axis=0)))));  # inferred
vdm_heatmaps(seqlets, d, included_samples, tasks, pattern, 
             pssm_fig=p(), top_n=top_n, opened=False)
Out[291]:
Sequence:



ChIP-nexus counts:
Importance scores (profile)
Importance scores (counts)

percnormed_score in [0.5, 1]

In [312]:
dff = df[(df.pattern_id == pattern) & \
        (100 < df.pattern_start) & \
        (900 > df.pattern_end) & \
        (0.5 < df.percnormed_score) & \
        (df.percnormed_score < 1)
        ]
In [313]:
# Number of seqlets
len(dff)
Out[313]:
961
In [314]:
seqlets = df2seqlets(dff, 40)
In [315]:
display(fig2vdom(mr.plot_pssm(*pattern.split("/"))))  # original
display(fig2vdom(seqlogo_fig(ic_scale(extract_signal(d['inputs'][included_samples], seqlets).mean(axis=0)))));  # inferred
vdm_heatmaps(seqlets, d, included_samples, tasks, pattern, 
             pssm_fig=p(), top_n=top_n, opened=False)
Out[315]:
Sequence:



ChIP-nexus counts:
Importance scores (profile)
Importance scores (counts)

percnormed_score in [0.2, 3]

In [147]:
dff = df[(df.pattern_id == pattern) & \
        (0.2 < df.percnormed_score) & \
        (df.percnormed_score < 0.3)
        ]
In [148]:
# Number of seqlets
len(dff)
Out[148]:
219
In [149]:
seqlets = df2seqlets(dff, 40)
In [150]:
display(fig2vdom(mr.plot_pssm(*pattern.split("/"))))  # original
display(fig2vdom(seqlogo_fig(ic_scale(extract_signal(d['inputs'][included_samples], seqlets).mean(axis=0)))));  # inferred
vdm_heatmaps(seqlets, d, included_samples, tasks, pattern, 
             pssm_fig=p(), top_n=top_n, opened=False)
Out[150]:
Sequence:



ChIP-nexus counts:
Importance scores (profile)
Importance scores (counts)