Requires

  • {output_dir}/footprints.pkl
  • {modisco_dir}/modisco.h5
  • {modisco_dir}/instances.parq
  • {modisco_dir}/pattern_table.csv

Produces

  • {output_dir}/patterns.pkl
In [2]:
from uuid import uuid4
from collections import OrderedDict
from basepair.math import mean
from basepair.stats import perc

from IPython.display import display, HTML
from basepair.plot.vdom import df2html, df2html_old, render_datatable

from basepair.modisco.core import patterns_to_df
from basepair.modisco.utils import longer_pattern, shorten_pattern, extract_name_short
from basepair.imports import *


# Trained model directory; `ddir` comes from the wildcard import of basepair.imports
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
# TF-MoDISco results computed on DeepLIFT profile contribution scores
modisco_dir = model_dir / f"modisco/all/deeplift/profile/"
# Web-served output location where patterns.pkl will be written
output_dir = Path("/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/deeplift/profile")
In [3]:
# Open the TF-MoDISco HDF5 result file for reading
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()

# Load every discovered pattern object
patterns = [mr.get_pattern(p) for p in mr.patterns()]

# Task names: keep only the part before the first "/"
# (presumably "<TF>/<head>" -> "<TF>"; verify against mr.tasks())
tasks = [x.split("/")[0] for x in mr.tasks()]

Add profile to each pattern

In [4]:
# read the property table
# read the property table (per-pattern summary statistics, one row per pattern)
pattern_table = pd.read_csv(output_dir / "pattern_table.csv")
# read the footprints (keyed by pattern name; attached to patterns below)
footprints = read_pkl(output_dir / 'footprints.pkl')
In [5]:
# Sanity-check the loaded summary table (see Out[5] below)
pattern_table.head()
Out[5]:
idx pattern n seqlets ic pwm mean Klf4 imp profile Klf4 imp counts Klf4 footprint entropydiff Klf4 footprint max Klf4 footprint strandcor Klf4 footprint counts Klf4 region counts Klf4 pos meandiff Klf4 pos std Klf4 pos unimodal Klf4 periodicity 10bp Nanog imp profile Nanog imp counts Nanog footprint entropydiff Nanog footprint max Nanog footprint strandcor Nanog footprint counts Nanog region counts Nanog pos meandiff Nanog pos std Nanog pos unimodal Nanog periodicity 10bp Oct4 imp profile Oct4 imp counts Oct4 footprint entropydiff Oct4 footprint max Oct4 footprint strandcor Oct4 footprint counts Oct4 region counts Oct4 pos meandiff Oct4 pos std Oct4 pos unimodal Oct4 periodicity 10bp Sox2 imp profile Sox2 imp counts Sox2 footprint entropydiff Sox2 footprint max Sox2 footprint strandcor Sox2 footprint counts Sox2 region counts Sox2 pos meandiff Sox2 pos std Sox2 pos unimodal Sox2 periodicity 10bp consensus
0 0 m0_p0 13278 0.9819 0.0209 0.0111 0.0540 0.5510 0.0052 116.8492 263.2435 2.0350 87.2859 True 0.0177 0.0258 0.0258 0.0913 1.4743 0.0054 289.9074 527.6562 2.0068 68.7499 True 0.0119 0.0445 0.0384 0.1820 5.1700 0.0061 401.0843 756.3519 1.9982 43.9320 True 0.0332 0.0343 0.0238 0.2260 1.9023 0.0064 135.1212 282.1146 2.0135 59.5346 True 0.0245 TTTGCATAACAAAGG
1 1 m0_p1 1862 0.6376 0.0128 0.0068 0.1565 1.2223 0.0055 200.3641 352.8851 2.0062 36.4735 True 0.0310 0.0257 0.0159 0.2929 4.7277 0.0061 516.2857 770.6777 2.0232 39.6288 True 0.0264 0.0080 0.0104 0.0413 1.0024 0.0051 216.5016 470.1912 2.0263 43.8143 True 0.0474 0.0204 0.0101 0.3281 3.0287 0.0070 188.5086 335.6944 2.0041 40.9617 True 0.0366 AGATGACTCCATTGTTCTGCCA
2 2 m0_p2 1647 0.5100 0.0107 0.0051 0.1392 0.8716 0.0056 155.8215 293.5659 1.9943 34.1107 True 0.0228 0.0401 0.0143 0.5313 10.3585 0.0076 821.9380 1115.3187 2.0023 30.0836 True 0.0225 0.0080 0.0080 0.0690 1.2456 0.0052 239.1451 496.5714 2.0082 34.1424 True 0.0305 0.0096 0.0083 0.1392 0.9265 0.0055 139.6715 275.4329 2.0110 37.4533 True 0.0160 GAGCCATCAAGATCCAA
3 3 m0_p3 847 0.7699 0.0073 0.0041 0.1861 0.9858 0.0051 125.3058 260.8737 2.0206 65.1018 True 0.0290 0.0160 0.0088 0.4463 4.8253 0.0065 293.3069 505.6801 2.0150 49.8275 True 0.0196 0.0104 0.0082 0.2604 3.0266 0.0058 365.0094 673.6305 2.0300 56.5630 True 0.0427 0.0084 0.0065 0.3563 2.0094 0.0066 118.2184 237.6859 2.0166 50.2757 True 0.0276 TTTGGGGAAGTCCAGGTACTC...
4 4 m0_p4 669 0.9353 0.0204 0.0102 0.0970 0.7123 0.0053 149.5426 300.6069 1.9887 49.1740 True 0.0046 0.0135 0.0044 0.0841 1.2474 0.0052 262.8116 467.8983 2.0212 49.3346 True 0.0080 0.0083 0.0062 0.0797 1.4230 0.0050 233.1659 504.0120 2.0308 50.7619 True 0.0043 0.0073 0.0054 0.0830 0.4821 0.0050 85.4783 212.9791 2.0498 66.6752 True 0.0053 AAGTTCAAGGTCAGCCT
In [6]:
# Attach the measured footprint (keyed by pattern name) to each pattern object.
_with_profiles = []
for pattern in patterns:
    _with_profiles.append(pattern.add_profile(footprints[pattern.name]))
patterns = _with_profiles

Add the fraction of other motifs

(Motif heterogeneity, co-occurrence)

In [7]:
from basepair.modisco.motif_clustering import to_colors, preproc_motif_table, motif_table_long, scale, preproc_df
In [8]:
from basepair.modisco.pattern_instances import load_instances, filter_nonoverlapping_intervals, plot_coocurence_matrix
In [9]:
# TODO - speedup using IntervalIndex .. .loc[motif_center...]
In [10]:
# specify motifs to use in the analysis
# Manually curated motifs used in the analysis: display name -> modisco pattern id.
# NOTE(review): "Essrb" is spelled this way throughout the downstream tables
# (presumably Esrrb) -- keep the key unchanged so existing columns still match.
_motif_pairs = [
    ("Oct4-Sox2", "m0_p0"),
    ("Oct4-Sox2-deg", "m6_p8"),
    ("Oct4", "m0_p18"),
    ("Sox2", "m0_p1"),
    ("Essrb", "m0_p2"),
    ("Nanog", "m0_p3"),
    ("Nanog-periodic", "m0_p9"),
    ("Klf4", "m2_p0"),
]
motifs = OrderedDict(_motif_pairs)
In [11]:
# Load the scanned motif instances for the selected motifs only
dfi = load_instances(f"{modisco_dir}/instances.parq", motifs)

# Drop overlapping instances (prints the dedup summary shown below)
dfi = filter_nonoverlapping_intervals(dfi)

# Number of distinct regions containing at least one motif instance
total_examples = len(dfi.example_idx.unique())
total_examples
number of deduplicated instances: 2539231 (34.8230835606662%)
Out[11]:
61282
In [12]:
# Read seqlets from modisco
# Read seqlets from modisco, trimming 8% off each pattern flank
df_seqlets = mr.seqlet_df_instances(trim_frac=0.08)
# Harmonize the column names with the instance table (dfi)
df_seqlets = df_seqlets.rename(columns=dict(seqname="example_idx", start='seqlet_start', end='seqlet_end', strand='seqlet_strand', center='seqlet_center'))
# Map long pattern names to the short form (e.g. "m0_p0", as in Out[18] below)
df_seqlets['pattern'] = df_seqlets['pattern'].map({k: shorten_pattern(k) for k in df_seqlets.pattern.unique()})
del df_seqlets['name']
TF-MoDISco is using the TensorFlow backend.
In [ ]:
def non_overlapping(x, df_seqlets):
    """Keep motif instances located more than 15 bp from every matching seqlet.

    Args:
      x: scanned motif instances for one particular pattern in one
        particular region. Uses columns: pattern_short, example_idx,
        pattern_center.
      df_seqlets: pd.DataFrame of seqlets.
        Columns: example_idx, pattern, seqlet_{start, end, strand, center}

    Returns:
      The subset of `x` whose centers lie > 15 bp away from the nearest
      seqlet of the same pattern in the same region. `x` is returned
      unchanged when it is empty or when no matching seqlets exist.
    """
    if x.empty:
        return x

    # Seqlets of the same pattern in the same region as this instance group
    pattern = x['pattern_short'].iloc[0]
    example = x['example_idx'].iloc[0]
    same = df_seqlets[(df_seqlets.pattern == pattern) &
                      (df_seqlets.example_idx == example)]
    if same.empty:
        return x

    # Pairwise |instance_center - seqlet_center| via broadcasting, then
    # the distance from each instance to its nearest seqlet.
    inst_centers = x['pattern_center'].values[:, None]
    seqlet_centers = same.seqlet_center.values[None, :]
    nearest = np.abs(inst_centers - seqlet_centers).min(axis=1)
    return x[nearest > 15]

TODO

  • set up counting
    • for each seqlet, count the environment
  • index by:
    • example_idx, pattern type

Notes

  • are we indeed seqlet-centered?
  • can you also max-restrict it?
In [18]:
# Inspect the harmonized seqlet table (see Out[18] below)
df_seqlets.head()
Out[18]:
example_idx seqlet_start seqlet_end seqlet_strand pattern seqlet_center
0 10828 468 484 - m0_p0 476.0
1 28550 494 510 - m0_p0 502.0
2 9201 447 463 + m0_p0 455.0
3 4631 497 513 + m0_p0 505.0
4 23162 528 544 - m0_p0 536.0
In [ ]:
# Keep only confidently matched, high-importance instances
dfi_subset = dfi.query('match_weighted_cat!="low"').query('imp_weighted_cat=="high"')

# get rid of overlapping intervals
# (per (pattern, region) group, drop instances within 15 bp of a seqlet
#  of the same pattern -- see non_overlapping above)
dfi_subset_subset = dfi_subset.groupby(['pattern_short', 'example_idx']).apply(lambda x: non_overlapping(x, df_seqlets))

# region x pattern matrix of instance counts
counts = pd.pivot_table(dfi_subset_subset, 'pattern_len', "example_idx", "pattern_name", aggfunc=len, fill_value=0)
c = counts >0  # True or False per interval per pattern

# Annotate each seqlet with the presence flags of its region
df_seqlets_c = c.merge(df_seqlets, on='example_idx')
In [19]:
# Inspect the seqlet table with per-region motif presence flags (Out[19])
df_seqlets_c.head(2)
Out[19]:
example_idx Essrb Klf4 Nanog Nanog-periodic Oct4 Oct4-Sox2 Oct4-Sox2-deg Sox2 seqlet_start seqlet_end seqlet_strand pattern seqlet_center
0 2 False True True True False False True False 497 513 + m0_p0 505.0
1 3 False False False False False False True False 499 515 - m0_p0 507.0
In [20]:
# Per pattern: fraction of seqlets whose region also contains each motif
df_seqlets_c_agg = df_seqlets_c.groupby(['pattern'])[list(c)].mean()
df_seqlets_c_agg_size = df_seqlets_c.groupby(['pattern']).size()
# Append the seqlet counts; .size() yields a column named 0 -> rename it
df_seqlets_c_agg = pd.concat([df_seqlets_c_agg, df_seqlets_c_agg_size], axis=1).reset_index().set_index('pattern').rename(columns={0: 'n_seqlets'})

# sort using the classical names
# (extract_name_short presumably yields metacluster/pattern sort keys -- verify)
df_seqlets_c_agg = pd.concat([df_seqlets_c_agg, pd.DataFrame(df_seqlets_c_agg.index.to_series().apply(extract_name_short).tolist(), index=df_seqlets_c_agg.index)], axis=1)
df_seqlets_c_agg = df_seqlets_c_agg.sort_values(['metacluster', 'pattern'])
# Drop the helper sort-key columns again
del df_seqlets_c_agg['metacluster']
del df_seqlets_c_agg['pattern']


# Keep the seqlet counts aside as a column annotation
col_anno = df_seqlets_c_agg[['n_seqlets']]
del df_seqlets_c_agg['n_seqlets']
In [ ]:
# Add other motifs
In [21]:
# Attach the per-pattern motif co-occurrence frequencies under 'motif_freq'.
_annotated = []
for pattern in patterns:
    freq = dict(df_seqlets_c_agg.loc[shorten_pattern(pattern.name)])
    _annotated.append(pattern.add_attr('motif_freq', freq))
patterns = _annotated

Add pattern features

In [12]:
# patterns = read_pkl(output_dir / 'patterns.pkl')
In [25]:
# Attach the matching pattern_table row to each pattern under 'features'
patterns = [p.add_attr('features', OrderedDict(pattern_table[pattern_table.pattern == shorten_pattern(p.name)].iloc[0])) for p in patterns]

# check that the pattern names match
assert patterns[4].attrs['features']['pattern'] == shorten_pattern(patterns[4].name)
In [26]:
# Persist the fully annotated patterns (the notebook's declared output)
write_pkl(patterns, output_dir / 'patterns.pkl')