In [4]:
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
imp_score = 'profile/wn'

Goal

  • Plot the per-TF motifs as a table
In [5]:
# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
from basepair.plot.profiles import multiple_plot_stranded_profile
from basepair.exp.paper.config import tf_colors
hv.extension('bokeh')
Using TensorFlow backend.
In [6]:
from basepair.plot.profiles import multiple_plot_stranded_profile
from basepair.plot.table import render_mpl_table
from basepair.exp.paper.fig4 import *
from basepair.exp.paper.config import *
In [7]:
# Use paper config
paper_config()
In [8]:
tasks = ['Oct4', 'Sox2', 'Nanog', 'Klf4']
In [9]:
model_dir = models_dir / exp
figures = Path(f"{ddir}/figures/modisco/{exp}")
In [10]:
mr_dict = [(task, ModiscoResult(model_dir / f"deeplift/{task}/out/{imp_score}/modisco.h5"))
           for task in tasks]
In [11]:
mr = MultipleModiscoResult({task: model_dir / f"deeplift/{task}/out/{imp_score}/modisco.h5"
                           for task in tasks})
In [12]:
mrp = mr_dict[0][1]
In [13]:
{k: v.sum() for k,v in mrp.get_pattern(mrp.patterns()[0]).contrib.items()}
Out[13]:
{'Oct4': 0.46344988933744874}
In [14]:
{k: v.sum() for k,v in mr.get_pattern(mr.patterns()[0]).contrib.items()}
Out[14]:
{'Oct4': 0.46344988933744874}
In [15]:
isf = ImpScoreFile(model_dir / 'deeplift.imp_score.h5')
In [16]:
len(isf)
Out[16]:
147974
In [17]:
for k,v in mr_dict: v.open()
In [18]:
footprints = {t: read_pkl(model_dir / f"deeplift/{t}/out/{imp_score}/footprints.pkl")
              for t in tasks}

Create the table

In [13]:
df = pd.DataFrame([{"TF": t, **mr.stats()} for t,mr in mr_dict])
TF-MoDISco is using the TensorFlow backend.
# seqlets assigned to patterns: 19714 / 43216 (46.0%)
# seqlets assigned to patterns: 7255 / 17735 (41.0%)
# seqlets assigned to patterns: 13463 / 101377 (13.0%)
# seqlets assigned to patterns: 16180 / 104209 (16.0%)
In [14]:
df[['%clustered']] = np.round(df[['clustered_seqlets_frac']] * 100, decimals=1)
In [15]:
dfp = df[['TF', 'all_seqlets', 'clustered_seqlets', '%clustered', 'patterns']]
dfp
Out[15]:
TF all_seqlets clustered_seqlets %clustered patterns
0 Oct4 43216 19714 45.6 19
1 Sox2 17735 7255 40.9 10
2 Nanog 101377 13463 13.3 19
3 Klf4 104209 16180 15.5 13
In [59]:
render_mpl_table(dfp, col_width=1.8, 
                 row_height=0.3,
                 header_color='white', font_size=10,
                 row_colors = ['w'],
                 header_text_color='black')
plt.savefig(figures / 'seqlet-stats.pdf')

Plot all the motifs

Code

In [16]:
optimal_align_patterns??
Object `optimal_align_patterns` not found.
In [19]:
def cluster_align_patterns(patterns, n_clusters=9):
    """Hierarchically cluster patterns by sequence similarity and align them.

    Args:
        patterns: list of pattern objects carrying a 'seq_ic' track
            (sequence information content), as produced by `get_patterns`.
        n_clusters: number of clusters to cut the dendrogram into.

    Returns:
        tuple of (sim_seq, lm_seq, order, patterns_clustered):
            sim_seq: pairwise similarity matrix over `patterns`.
            lm_seq: scipy linkage matrix (Ward, optimal leaf ordering).
            order: argsort of the cluster order (permutation to recover
                the original pattern order from the clustered order).
            patterns_clustered: patterns reordered by cluster and aligned
                within clusters.
    """
    # get the similarity matrix
    sim_seq = similarity_matrix(patterns, track='seq_ic')

    # cluster
    # condensed upper-triangle distances: convert similarity -> distance via 1 - sim
    iu = np.triu_indices(len(sim_seq), 1)
    lm_seq = linkage(1 - sim_seq[iu], 'ward', optimal_ordering=True)

    # determine clusters and the cluster order
    cluster = cut_tree(lm_seq, n_clusters=n_clusters)[:, 0]
    cluster_order = np.argsort(leaves_list(lm_seq))

    # align patterns
    patterns_clustered = align_clustered_patterns(patterns, cluster_order, cluster,
                                                  align_track='seq_ic',
                                                  metric='continousjaccard',
                                                  # don't shift the major patterns
                                                  # by more than 15 when aligning
                                                  trials=20,
                                                  max_shift=15)
    
    return sim_seq, lm_seq, np.argsort(cluster_order), patterns_clustered
In [28]:
all_patterns = get_patterns(mr_dict, footprints, tasks, min_n_seqlets=100)
patterns = [p for p in all_patterns if p.seq_info_content > 4]
short_patterns = [p for p in patterns if p.seq_info_content < 30]
sim_seq, lm_seq, sort_idx, short_patterns_clustered = cluster_align_patterns(short_patterns, n_clusters=11)
pattern_thr_100 = get_patterns(mr_dict, footprints, tasks, min_n_seqlets=100)
long_patterns = [p for p in pattern_thr_100 if p.seq_info_content >= 30]
long_patterns_clustered = cluster_align_patterns(long_patterns, n_clusters=25)
100%|██████████| 33/33 [00:04<00:00,  8.26it/s]
100%|██████████| 33/33 [00:00<00:00, 240.96it/s]
100%|██████████| 18/18 [00:01<00:00, 14.85it/s]
100%|██████████| 18/18 [00:00<00:00, 247.23it/s]
In [29]:
len(patterns)
Out[29]:
51
In [30]:
len(long_patterns)
Out[30]:
18
In [31]:
len(short_patterns)
Out[31]:
33
In [130]:
# all_patterns = get_patterns(mr_dict, footprints, tasks, min_n_seqlets=100)
# sim_seq, lm_seq, sort_idx, all_patterns_clustered = cluster_align_patterns(all_patterns, n_clusters=25)
100%|██████████| 56/56 [00:10<00:00,  5.05it/s]
100%|██████████| 56/56 [00:00<00:00, 241.32it/s]
In [124]:
# all_patterns = get_patterns(mr_dict, footprints, tasks, min_n_seqlets=100)
# patterns = [p for p in all_patterns if p.seq_info_content > 4]
# short_patterns = [p for p in patterns if p.seq_info_content < 30]
# sim_seq, lm_seq, sort_idx, short_patterns_clustered = cluster_align_patterns(short_patterns, n_clusters=10)
100%|██████████| 24/24 [00:02<00:00,  9.99it/s]
100%|██████████| 24/24 [00:00<00:00, 254.67it/s]
In [49]:
short_patterns = [p.rename(p.attrs['TF'] + '/' + p.name) for p in short_patterns]
In [50]:
short_patterns_clustered = [p.rename(p.attrs['TF'] + '/' + p.name) for p in short_patterns_clustered]
In [51]:
len(short_patterns)
Out[51]:
33
In [52]:
len(long_patterns)
Out[52]:
18
In [53]:
def short_name(pn):
    """Abbreviate a pattern name, e.g. 'Oct4/metacluster_0/pattern_1' -> 'Oct4/p1'."""
    tf, _metacluster, pattern_id = pn.split("/")
    suffix = pattern_id.split("_")[1]
    return f"{tf}/p{suffix}"
In [54]:
p = short_patterns_clustered[0]
In [109]:
for i in range(len(short_patterns_clustered)):
    short_patterns_clustered[i].trim_seq_ic(0.08).plot('seq_ic', height=0.4, letter_width=0.12);
    strip_axis(plt.gca())
In [101]:
pnames = [short_name(p.name) for p in short_patterns]
df = pd.DataFrame(sim_seq, 
                 index=pnames,
                 columns=pnames)
In [128]:
cm = sns.clustermap(df, 
                    row_linkage=lm_seq,
                    col_linkage=lm_seq,
                    cmap='Blues', figsize=get_figsize(.9, 1))
plt.savefig(figures / 'cluster.short-motifs.heatmap.100seqlets.pdf')

Long motif clustering

In [20]:
long_patterns = [p.add_attr('features', {"n seqlets": mr.n_seqlets(p.name)}) for p in mr.get_all_patterns() 
                  if p.seq_info_content >= 30 and mr.n_seqlets(p.name) > 100]
TF-MoDISco is using the TensorFlow backend.
In [28]:
sim_seq, lm_seq, sort_idx, long_patterns_clustered = cluster_align_patterns(long_patterns, n_clusters=len(long_patterns))
100%|██████████| 18/18 [00:01<00:00, 13.92it/s]
100%|██████████| 18/18 [00:00<00:00, 226.56it/s]
In [29]:
cluster_align_patterns
Out[29]:
<function __main__.cluster_align_patterns(patterns, n_clusters=9)>
In [30]:
def short_name(pn):
    """Shorten 'TASK/metacluster_M/pattern_N' to 'TASK/pN'.

    NOTE(review): duplicate of the identical short_name defined earlier in
    the notebook; this later definition silently shadows the earlier one.
    """
    task, _mc, pattern_part = pn.split("/")
    return "/".join([task, "p" + pattern_part.split("_")[1]])
In [31]:
pnames = [short_name(p.name) for p in long_patterns]
long_patterns = [p.rename(short_name(p.name)) for p in long_patterns]
In [33]:
df = pd.DataFrame(sim_seq, 
                 index=pnames,
                 columns=pnames)
In [34]:
for i in range(len(long_patterns_clustered)):
    long_patterns_clustered[i].trim_seq_ic(0.08).plot('seq_ic', height=0.4, letter_width=0.12);
    strip_axis(plt.gca())
In [35]:
cm = sns.clustermap(df, 
                    row_linkage=lm_seq,
                    col_linkage=lm_seq,
                    cmap='Blues', figsize=get_figsize(.5, 1))
plt.savefig(figures / 'cluster.long-motifs.heatmap.pdf')
In [36]:
# Note -> patterns with higher Sequence IC used a lower threshold (min_n_seqlets > 100)
In [79]:
# non-bias corrected
fig, ax = plt.subplots(figsize=get_figsize(.25))
ax.hist([p.seq_info_content for p in patterns + long_patterns], 20);

ax.set_ylabel("Frequency");
ax.set_xlabel("Sequence IC");
fig.savefig(figures / 'pattern-length.pdf', bbox_inches='tight', pad_inches=0)
In [162]:
paper_config()
In [163]:
# Non-bias corrected
fig = plot_patterns(list(reversed(short_patterns_clustered)), tasks, pattern_trim=(27, 48))
fig.savefig(figures / 'short-patterns.table.100seqlets-v2.pdf')
fig.savefig(figures / 'short-patterns.table.100seqlets-v2.png')
In [55]:
short_patterns_rev = list(reversed(short_patterns_clustered))
In [56]:
# dfp = pd.read_csv(f"{model_dir}/deeplift/{task}/out/{imp_score}/pattern_table.csv")
In [ ]:
 
In [57]:
short_patterns_rev_list = [p.name for p in short_patterns_rev]
In [58]:
len(short_patterns_rev_list)
Out[58]:
33
In [59]:
# Load the per-TF pattern tables and concatenate them into one DataFrame,
# tagging each row with its task and a fully-qualified pattern name.
dfp_list = []
for task in tasks:
    dfp = pd.read_csv(f"{model_dir}/deeplift/{task}/out/{imp_score}/pattern_table.csv")
    dfp['task'] = task
    # longer_pattern expands the short id, e.g. 'm0_p0' -> 'metacluster_0/pattern_0'
    # (see the 'pattern' vs 'pattern_name' columns in the table output below)
    dfp['pattern_name'] = task + '/' + dfp['pattern'].map(longer_pattern)
    dfp_list.append(dfp)
dfpl = pd.concat(dfp_list)
In [60]:
dfpl.head()
Out[60]:
Klf4 footprint counts Klf4 footprint entropydiff Klf4 footprint max Klf4 footprint strandcor Klf4 imp counts Klf4 imp profile Klf4 periodicity 10bp Klf4 pos meandiff Klf4 pos std Klf4 pos unimodal Klf4 region counts Nanog footprint counts Nanog footprint entropydiff Nanog footprint max Nanog footprint strandcor Nanog imp counts Nanog imp profile Nanog periodicity 10bp Nanog pos meandiff Nanog pos std Nanog pos unimodal Nanog region counts Oct4 footprint counts Oct4 footprint entropydiff Oct4 footprint max Oct4 footprint strandcor Oct4 imp counts Oct4 imp profile Oct4 periodicity 10bp Oct4 pos meandiff Oct4 pos std Oct4 pos unimodal Oct4 region counts Sox2 footprint counts Sox2 footprint entropydiff Sox2 footprint max Sox2 footprint strandcor Sox2 imp counts Sox2 imp profile Sox2 periodicity 10bp Sox2 pos meandiff Sox2 pos std Sox2 pos unimodal Sox2 region counts consensus ic pwm mean idx n seqlets pattern pattern_name task
0 143.6074 0.0257 0.5459 0.0051 0.0203 0.0105 0.0686 NaN NaN NaN 402.9995 473.8224 0.0760 2.3823 0.0053 0.0538 0.0236 0.3644 NaN NaN NaN 956.8774 263.7851 0.1400 2.9668 0.0057 0.0672 0.0255 0.0679 1.9860 121.6666 True 572.4199 88.1211 0.1908 1.1994 0.0060 0.0480 0.0194 0.1548 NaN NaN NaN 205.1461 CCTTTGTTATGCAAAT 0.8710 0 10742 m0_p0 Oct4/metacluster_0/pa... Oct4
1 149.8835 0.0166 0.5083 0.0050 0.0076 0.0053 0.1108 NaN NaN NaN 429.0092 364.6005 0.0323 1.3482 0.0050 0.0212 0.0110 0.1445 NaN NaN NaN 822.5327 184.1158 0.0282 0.7830 0.0051 0.0301 0.0136 0.0723 1.9790 122.8195 True 472.7943 53.2370 0.0244 0.2550 0.0050 0.0186 0.0068 0.1040 NaN NaN NaN 159.0560 TTATGCAAAT 1.0574 1 1517 m0_p1 Oct4/metacluster_0/pa... Oct4
2 152.2931 0.0188 0.5257 0.0050 0.0090 0.0061 0.0577 NaN NaN NaN 428.6114 403.7701 0.0323 1.5600 0.0051 0.0278 0.0145 0.2506 NaN NaN NaN 870.4996 201.9634 0.0427 0.9988 0.0051 0.0336 0.0146 0.0299 2.0021 118.7279 True 502.7864 60.7334 0.0370 0.3309 0.0050 0.0227 0.0087 0.0897 NaN NaN NaN 170.4241 ATATGCAAAT 1.1637 2 1297 m0_p2 Oct4/metacluster_0/pa... Oct4
3 240.1971 0.0313 1.0273 0.0051 0.0121 0.0074 0.0540 NaN NaN NaN 570.9031 1102.3789 0.1304 6.9842 0.0054 0.0502 0.0264 0.4378 NaN NaN NaN 1782.2529 240.9923 0.0220 0.9737 0.0050 0.0267 0.0073 0.0769 1.9750 106.9080 True 554.0219 152.1856 0.1517 1.9368 0.0058 0.0291 0.0174 0.2354 NaN NaN NaN 295.8698 CCTTTGTTCT 0.9832 3 1052 m0_p3 Oct4/metacluster_0/pa... Oct4
4 347.7702 0.1958 3.1925 0.0059 0.0673 0.0439 0.0247 NaN NaN NaN 738.7773 577.4963 0.0324 2.5325 0.0050 0.0152 0.0110 0.1467 NaN NaN NaN 1245.0659 197.0388 0.0409 1.0194 0.0049 0.0202 0.0068 0.0381 1.9809 176.7330 True 526.9279 75.3001 0.0351 0.3578 0.0050 0.0174 0.0061 0.0606 NaN NaN NaN 202.1866 GCCACACCCAG 1.0765 4 970 m0_p4 Oct4/metacluster_0/pa... Oct4
In [61]:
dfpl.set_index("pattern_name", inplace=True)
In [62]:
dfpl
Out[62]:
Klf4 footprint counts Klf4 footprint entropydiff Klf4 footprint max Klf4 footprint strandcor Klf4 imp counts Klf4 imp profile Klf4 periodicity 10bp Klf4 pos meandiff Klf4 pos std Klf4 pos unimodal Klf4 region counts Nanog footprint counts Nanog footprint entropydiff Nanog footprint max Nanog footprint strandcor Nanog imp counts Nanog imp profile Nanog periodicity 10bp Nanog pos meandiff Nanog pos std Nanog pos unimodal Nanog region counts Oct4 footprint counts Oct4 footprint entropydiff Oct4 footprint max Oct4 footprint strandcor Oct4 imp counts Oct4 imp profile Oct4 periodicity 10bp Oct4 pos meandiff Oct4 pos std Oct4 pos unimodal Oct4 region counts Sox2 footprint counts Sox2 footprint entropydiff Sox2 footprint max Sox2 footprint strandcor Sox2 imp counts Sox2 imp profile Sox2 periodicity 10bp Sox2 pos meandiff Sox2 pos std Sox2 pos unimodal Sox2 region counts consensus ic pwm mean idx n seqlets pattern task
pattern_name
Oct4/metacluster_0/pattern_0 143.6074 0.0257 0.5459 0.0051 0.0203 0.0105 0.0686 NaN NaN NaN 402.9995 473.8224 0.0760 2.3823 0.0053 0.0538 0.0236 0.3644 NaN NaN NaN 956.8774 263.7851 0.1400 2.9668 0.0057 0.0672 0.0255 0.0679 1.9860 121.6666 True 572.4199 88.1211 0.1908 1.1994 0.0060 0.0480 0.0194 0.1548 NaN NaN NaN 205.1461 CCTTTGTTATGCAAAT 0.8710 0 10742 m0_p0 Oct4
Oct4/metacluster_0/pattern_1 149.8835 0.0166 0.5083 0.0050 0.0076 0.0053 0.1108 NaN NaN NaN 429.0092 364.6005 0.0323 1.3482 0.0050 0.0212 0.0110 0.1445 NaN NaN NaN 822.5327 184.1158 0.0282 0.7830 0.0051 0.0301 0.0136 0.0723 1.9790 122.8195 True 472.7943 53.2370 0.0244 0.2550 0.0050 0.0186 0.0068 0.1040 NaN NaN NaN 159.0560 TTATGCAAAT 1.0574 1 1517 m0_p1 Oct4
Oct4/metacluster_0/pattern_2 152.2931 0.0188 0.5257 0.0050 0.0090 0.0061 0.0577 NaN NaN NaN 428.6114 403.7701 0.0323 1.5600 0.0051 0.0278 0.0145 0.2506 NaN NaN NaN 870.4996 201.9634 0.0427 0.9988 0.0051 0.0336 0.0146 0.0299 2.0021 118.7279 True 502.7864 60.7334 0.0370 0.3309 0.0050 0.0227 0.0087 0.0897 NaN NaN NaN 170.4241 ATATGCAAAT 1.1637 2 1297 m0_p2 Oct4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Klf4/metacluster_0/pattern_10 230.7921 0.1216 1.5140 0.0052 0.0137 0.0074 0.0327 2.0074 94.6669 True 516.2514 145.6011 0.3354 1.3511 0.0047 0.0024 0.0021 0.0252 NaN NaN NaN 493.3911 79.9438 0.1969 0.4972 0.0048 0.0033 0.0011 0.0656 NaN NaN NaN 284.4637 31.9663 0.1982 0.2612 0.0049 0.0025 0.0008 0.0684 NaN NaN NaN 126.3464 AAACCCACACCCACAGTGACA... 0.9222 10 179 m0_p10 Klf4
Klf4/metacluster_0/pattern_11 221.7812 0.1008 1.1219 0.0053 0.0254 0.0103 0.0352 1.9735 192.2644 True 627.0230 93.6062 0.1084 0.7281 0.0051 0.0111 0.0041 0.0590 NaN NaN NaN 346.6437 1250.4874 0.3764 8.1375 0.0067 0.0500 0.0410 0.0340 NaN NaN NaN 1931.6782 75.0000 0.1785 0.4625 0.0057 0.0246 0.0120 0.0435 NaN NaN NaN 197.9253 CGAGAGGTCCCGGGTTCGAAC... 0.5855 11 174 m0_p11 Klf4
Klf4/metacluster_0/pattern_12 197.2323 0.2965 1.7121 0.0062 0.0085 0.0046 0.1084 1.9143 167.7523 True 508.1359 187.1414 0.2322 1.5707 0.0046 0.0055 0.0035 0.2612 NaN NaN NaN 566.1359 86.7273 0.1879 0.5404 0.0048 0.0043 0.0016 0.0469 NaN NaN NaN 327.8350 32.5253 0.2590 0.2172 0.0046 0.0038 0.0016 0.1403 NaN NaN NaN 133.4757 CCTAAAAAAAGCGTCAAAGAA... 0.5912 12 103 m0_p12 Klf4

61 rows × 50 columns

In [63]:
dfp_imp = dfpl.loc[short_patterns_rev_list, [t + ' imp profile' for t in tasks]]
In [64]:
dfp_imp.columns = tasks
In [96]:
fig = plt.figure(figsize=get_figsize(0.5, aspect=2.5))
sns.heatmap(dfp_imp, cmap='Blues');
fig.savefig(figures / 'short-patterns.contrib_score.pdf')

Subset to patterns shown in Figure 4

In [66]:
df_subset = pd.read_csv("https://docs.google.com/spreadsheets/d/1MVBUbMs_F0UPgZxR8-v9eMM6DCFhVYc7kdH-C6WJnYU/export?gid=0&format=csv")
df_subset.name = df_subset.name.str.replace('"', '')
df_subset.task = df_subset.task.str.replace('"', '')
In [67]:
df_subset.head()
Out[67]:
name task n_seqlets pattern pattern_id
0 Oct4-Sox2 Oct4 10742 m0_p0 0
1 Oct4 Oct4 1517 m0_p1 1
2 Oct4-Oct4 Oct4 571 m0_p6 6
3 Sox2 Sox2 2009 m0_p1 1
4 Nanog Nanog 2981 m0_p1 1
In [68]:
df_subset['long_pattern'] = df_subset.task + "/" + df_subset.pattern.map(longer_pattern)
In [69]:
paper_config()
In [70]:
fig = plt.figure(figsize=get_figsize(0.5, aspect=2.5))
sns.heatmap(dfp_imp.loc[df_subset.long_pattern.values], cmap='Blues');
fig.savefig(figures / 'short-patterns.contrib_score.figure-subset.pdf')
In [ ]:
 
In [28]:
p = short_patterns_rev[0]
In [38]:
# Matrix of total contribution scores: one row per pattern, one column per task.
contrib_scores = np.empty((len(short_patterns_rev), len(tasks)))
for row, pattern in enumerate(short_patterns_rev):
    contrib_scores[row] = [pattern.contrib[task].sum() for task in tasks]
In [50]:
short_patterns_rev[8].plot('contrib');
In [40]:
sns.heatmap(contrib_scores);
In [ ]:
# create the table with the histogram
In [133]:
# Non-bias corrected
fig = plot_patterns(list(reversed(all_patterns_clustered)), tasks, pattern_trim=(5, 65))
fig.savefig(figures / 'all-patterns.table.pdf')
fig.savefig(figures / 'all-patterns.table.png')