Requires

  • {output_dir}/footprints.pkl
  • {modisco_dir}/modisco.h5
  • {modisco_dir}/instances.parq
  • {modisco_dir}/pattern_table.csv

Produces

  • {output_dir}/patterns.pkl
In [2]:
from uuid import uuid4
from collections import OrderedDict
from basepair.math import mean
from basepair.stats import perc

from IPython.display import display, HTML
from basepair.plot.vdom import df2html, df2html_old, render_datatable

from basepair.modisco.core import patterns_to_df
from basepair.modisco.utils import longer_pattern, shorten_pattern, extract_name_short
from basepair.imports import *


# Trained model directory; `ddir` comes from the wildcard import of basepair.imports
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
# TF-MoDISco results computed on DeepLIFT profile contribution scores
modisco_dir = model_dir / f"modisco/all/deeplift/profile/"
# Web-served output location where patterns.pkl will be written
output_dir = Path("/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/deeplift/profile")
In [3]:
# Open the TF-MoDISco HDF5 result file for reading
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()

# Load every discovered pattern object
patterns = [mr.get_pattern(p) for p in mr.patterns()]

# Task names: keep only the part before the first "/"
# (presumably "<TF>/<head>" -> "<TF>"; verify against mr.tasks())
tasks = [x.split("/")[0] for x in mr.tasks()]

Add profile to each pattern

In [4]:
# read the property table
# read the property table (per-pattern summary statistics, one row per pattern)
pattern_table = pd.read_csv(output_dir / "pattern_table.csv")
# read the footprints (keyed by pattern name; attached to patterns below)
footprints = read_pkl(output_dir / 'footprints.pkl')
In [5]:
# Sanity-check the loaded summary table (see Out[5] below)
pattern_table.head()
Out[5]:
idx pattern n seqlets ic pwm mean Klf4 imp profile Klf4 imp counts Klf4 footprint entropydiff Klf4 footprint max Klf4 footprint strandcor Klf4 footprint counts Klf4 region counts Klf4 pos meandiff Klf4 pos std Klf4 pos unimodal Klf4 periodicity 10bp Nanog imp profile Nanog imp counts Nanog footprint entropydiff Nanog footprint max Nanog footprint strandcor Nanog footprint counts Nanog region counts Nanog pos meandiff Nanog pos std Nanog pos unimodal Nanog periodicity 10bp Oct4 imp profile Oct4 imp counts Oct4 footprint entropydiff Oct4 footprint max Oct4 footprint strandcor Oct4 footprint counts Oct4 region counts Oct4 pos meandiff Oct4 pos std Oct4 pos unimodal Oct4 periodicity 10bp Sox2 imp profile Sox2 imp counts Sox2 footprint entropydiff Sox2 footprint max Sox2 footprint strandcor Sox2 footprint counts Sox2 region counts Sox2 pos meandiff Sox2 pos std Sox2 pos unimodal Sox2 periodicity 10bp consensus
0 0 m0_p0 13278 0.9819 0.0209 0.0111 0.0540 0.5510 0.0052 116.8492 263.2435 2.0350 87.2859 True 0.0177 0.0258 0.0258 0.0913 1.4743 0.0054 289.9074 527.6562 2.0068 68.7499 True 0.0119 0.0445 0.0384 0.1820 5.1700 0.0061 401.0843 756.3519 1.9982 43.9320 True 0.0332 0.0343 0.0238 0.2260 1.9023 0.0064 135.1212 282.1146 2.0135 59.5346 True 0.0245 TTTGCATAACAAAGG
1 1 m0_p1 1862 0.6376 0.0128 0.0068 0.1565 1.2223 0.0055 200.3641 352.8851 2.0062 36.4735 True 0.0310 0.0257 0.0159 0.2929 4.7277 0.0061 516.2857 770.6777 2.0232 39.6288 True 0.0264 0.0080 0.0104 0.0413 1.0024 0.0051 216.5016 470.1912 2.0263 43.8143 True 0.0474 0.0204 0.0101 0.3281 3.0287 0.0070 188.5086 335.6944 2.0041 40.9617 True 0.0366 AGATGACTCCATTGTTCTGCCA
2 2 m0_p2 1647 0.5100 0.0107 0.0051 0.1392 0.8716 0.0056 155.8215 293.5659 1.9943 34.1107 True 0.0228 0.0401 0.0143 0.5313 10.3585 0.0076 821.9380 1115.3187 2.0023 30.0836 True 0.0225 0.0080 0.0080 0.0690 1.2456 0.0052 239.1451 496.5714 2.0082 34.1424 True 0.0305 0.0096 0.0083 0.1392 0.9265 0.0055 139.6715 275.4329 2.0110 37.4533 True 0.0160 GAGCCATCAAGATCCAA
3 3 m0_p3 847 0.7699 0.0073 0.0041 0.1861 0.9858 0.0051 125.3058 260.8737 2.0206 65.1018 True 0.0290 0.0160 0.0088 0.4463 4.8253 0.0065 293.3069 505.6801 2.0150 49.8275 True 0.0196 0.0104 0.0082 0.2604 3.0266 0.0058 365.0094 673.6305 2.0300 56.5630 True 0.0427 0.0084 0.0065 0.3563 2.0094 0.0066 118.2184 237.6859 2.0166 50.2757 True 0.0276 TTTGGGGAAGTCCAGGTACTC...
4 4 m0_p4 669 0.9353 0.0204 0.0102 0.0970 0.7123 0.0053 149.5426 300.6069 1.9887 49.1740 True 0.0046 0.0135 0.0044 0.0841 1.2474 0.0052 262.8116 467.8983 2.0212 49.3346 True 0.0080 0.0083 0.0062 0.0797 1.4230 0.0050 233.1659 504.0120 2.0308 50.7619 True 0.0043 0.0073 0.0054 0.0830 0.4821 0.0050 85.4783 212.9791 2.0498 66.6752 True 0.0053 AAGTTCAAGGTCAGCCT
In [6]:
# Attach the measured footprint (keyed by pattern name) to each pattern object.
_with_profiles = []
for pattern in patterns:
    _with_profiles.append(pattern.add_profile(footprints[pattern.name]))
patterns = _with_profiles

Add the fraction of other motifs

(Motif heterogeneity, co-occurrence)

In [7]:
from basepair.modisco.motif_clustering import to_colors, preproc_motif_table, motif_table_long, scale, preproc_df
In [8]:
from basepair.modisco.pattern_instances import load_instances, filter_nonoverlapping_intervals, plot_coocurence_matrix
In [9]:
# TODO - speedup using IntervalIndex .. .loc[motif_center...]
In [10]:
# specify motifs to use in the analysis
# Manually curated motifs used in the analysis: display name -> modisco pattern id.
# NOTE(review): "Essrb" is spelled this way throughout the downstream tables
# (presumably Esrrb) -- keep the key unchanged so existing columns still match.
_motif_pairs = [
    ("Oct4-Sox2", "m0_p0"),
    ("Oct4-Sox2-deg", "m6_p8"),
    ("Oct4", "m0_p18"),
    ("Sox2", "m0_p1"),
    ("Essrb", "m0_p2"),
    ("Nanog", "m0_p3"),
    ("Nanog-periodic", "m0_p9"),
    ("Klf4", "m2_p0"),
]
motifs = OrderedDict(_motif_pairs)
In [11]:
# Load the scanned motif instances for the selected motifs only
dfi = load_instances(f"{modisco_dir}/instances.parq", motifs)

# Drop overlapping instances (prints the dedup summary shown below)
dfi = filter_nonoverlapping_intervals(dfi)

# Number of distinct regions containing at least one motif instance
total_examples = len(dfi.example_idx.unique())
total_examples
number of deduplicated instances: 2539231 (34.8230835606662%)
Out[11]:
61282
In [12]:
# Read seqlets from modisco
# Read seqlets from modisco, trimming 8% off each pattern flank
df_seqlets = mr.seqlet_df_instances(trim_frac=0.08)
# Harmonize the column names with the instance table (dfi)
df_seqlets = df_seqlets.rename(columns=dict(seqname="example_idx", start='seqlet_start', end='seqlet_end', strand='seqlet_strand', center='seqlet_center'))
# Map long pattern names to the short form (e.g. "m0_p0", as in Out[18] below)
df_seqlets['pattern'] = df_seqlets['pattern'].map({k: shorten_pattern(k) for k in df_seqlets.pattern.unique()})
del df_seqlets['name']
TF-MoDISco is using the TensorFlow backend.
In [ ]:
def non_overlapping(x, df_seqlets):
    """Keep motif instances located more than 15 bp from every matching seqlet.

    Args:
      x: scanned motif instances for one particular pattern in one
        particular region. Uses columns: pattern_short, example_idx,
        pattern_center.
      df_seqlets: pd.DataFrame of seqlets.
        Columns: example_idx, pattern, seqlet_{start, end, strand, center}

    Returns:
      The subset of `x` whose centers lie > 15 bp away from the nearest
      seqlet of the same pattern in the same region. `x` is returned
      unchanged when it is empty or when no matching seqlets exist.
    """
    if x.empty:
        return x

    # Seqlets of the same pattern in the same region as this instance group
    pattern = x['pattern_short'].iloc[0]
    example = x['example_idx'].iloc[0]
    same = df_seqlets[(df_seqlets.pattern == pattern) &
                      (df_seqlets.example_idx == example)]
    if same.empty:
        return x

    # Pairwise |instance_center - seqlet_center| via broadcasting, then
    # the distance from each instance to its nearest seqlet.
    inst_centers = x['pattern_center'].values[:, None]
    seqlet_centers = same.seqlet_center.values[None, :]
    nearest = np.abs(inst_centers - seqlet_centers).min(axis=1)
    return x[nearest > 15]

TODO

  • set up counting
    • for each seqlet, count the environment
  • index by:
    • example_idx, pattern type

Notes

  • are we indeed seqlet-centered?
  • can you also max-restrict it?
In [18]:
# Inspect the harmonized seqlet table (see Out[18] below)
df_seqlets.head()
Out[18]:
example_idx seqlet_start seqlet_end seqlet_strand pattern seqlet_center
0 10828 468 484 - m0_p0 476.0
1 28550 494 510 - m0_p0 502.0
2 9201 447 463 + m0_p0 455.0
3 4631 497 513 + m0_p0 505.0
4 23162 528 544 - m0_p0 536.0
In [ ]:
# Keep only confidently matched, high-importance instances
dfi_subset = dfi.query('match_weighted_cat!="low"').query('imp_weighted_cat=="high"')

# get rid of overlapping intervals
# (per (pattern, region) group, drop instances within 15 bp of a seqlet
#  of the same pattern -- see non_overlapping above)
dfi_subset_subset = dfi_subset.groupby(['pattern_short', 'example_idx']).apply(lambda x: non_overlapping(x, df_seqlets))

# region x pattern matrix of instance counts
counts = pd.pivot_table(dfi_subset_subset, 'pattern_len', "example_idx", "pattern_name", aggfunc=len, fill_value=0)
c = counts >0  # True or False per interval per pattern

# Annotate each seqlet with the presence flags of its region
df_seqlets_c = c.merge(df_seqlets, on='example_idx')
In [19]:
# Inspect the seqlet table with per-region motif presence flags (Out[19])
df_seqlets_c.head(2)
Out[19]:
example_idx Essrb Klf4 Nanog Nanog-periodic Oct4 Oct4-Sox2 Oct4-Sox2-deg Sox2 seqlet_start seqlet_end seqlet_strand pattern seqlet_center
0 2 False True True True False False True False 497 513 + m0_p0 505.0
1 3 False False False False False False True False 499 515 - m0_p0 507.0
In [20]:
# Per pattern: fraction of seqlets whose region also contains each motif
df_seqlets_c_agg = df_seqlets_c.groupby(['pattern'])[list(c)].mean()
df_seqlets_c_agg_size = df_seqlets_c.groupby(['pattern']).size()
# Append the seqlet counts; .size() yields a column named 0 -> rename it
df_seqlets_c_agg = pd.concat([df_seqlets_c_agg, df_seqlets_c_agg_size], axis=1).reset_index().set_index('pattern').rename(columns={0: 'n_seqlets'})

# sort using the classical names
# (extract_name_short presumably yields metacluster/pattern sort keys -- verify)
df_seqlets_c_agg = pd.concat([df_seqlets_c_agg, pd.DataFrame(df_seqlets_c_agg.index.to_series().apply(extract_name_short).tolist(), index=df_seqlets_c_agg.index)], axis=1)
df_seqlets_c_agg = df_seqlets_c_agg.sort_values(['metacluster', 'pattern'])
# Drop the helper sort-key columns again
del df_seqlets_c_agg['metacluster']
del df_seqlets_c_agg['pattern']


# Keep the seqlet counts aside as a column annotation
col_anno = df_seqlets_c_agg[['n_seqlets']]
del df_seqlets_c_agg['n_seqlets']
In [ ]:
# Add other motifs
In [21]:
# Attach the per-pattern motif co-occurrence frequencies under 'motif_freq'.
_annotated = []
for pattern in patterns:
    freq = dict(df_seqlets_c_agg.loc[shorten_pattern(pattern.name)])
    _annotated.append(pattern.add_attr('motif_freq', freq))
patterns = _annotated

Add pattern features

In [12]:
# patterns = read_pkl(output_dir / 'patterns.pkl')
In [25]:
# Attach the matching pattern_table row to each pattern under 'features'
patterns = [p.add_attr('features', OrderedDict(pattern_table[pattern_table.pattern == shorten_pattern(p.name)].iloc[0])) for p in patterns]

# check that the pattern names match
assert patterns[4].attrs['features']['pattern'] == shorten_pattern(patterns[4].name)
In [26]:
# Persist the fully annotated patterns (the notebook's declared output)
write_pkl(patterns, output_dir / 'patterns.pkl')