# Experiment identifier: encodes the model/training configuration of this run
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
# Importance-score track to read from the deeplift/modisco outputs
imp_score = 'profile/wn'
# Imports
# NOTE(review): the %-magics, `??` and bare display expressions below show this
# file is a Jupyter/IPython notebook export; it is not runnable as plain Python.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
from basepair.plot.profiles import multiple_plot_stranded_profile
from basepair.exp.paper.config import tf_colors
hv.extension('bokeh')
from basepair.plot.profiles import multiple_plot_stranded_profile
from basepair.plot.table import render_mpl_table
from basepair.exp.paper.fig4 import *
from basepair.exp.paper.config import *
# Use paper config
paper_config()
# Transcription factors analysed in this figure
tasks = ['Oct4', 'Sox2', 'Nanog', 'Klf4']
model_dir = models_dir / exp
# Output directory for all generated figure files
figures = Path(f"{ddir}/figures/modisco/{exp}")
# NOTE(review): despite the name, mr_dict is a LIST of (task, ModiscoResult)
# tuples, not a dict — it is iterated as pairs below.
mr_dict = [(task, ModiscoResult(model_dir / f"deeplift/{task}/out/{imp_score}/modisco.h5"))
           for task in tasks]
# Combined view over all per-task modisco result files
mr = MultipleModiscoResult({task: model_dir / f"deeplift/{task}/out/{imp_score}/modisco.h5"
                            for task in tasks})
mrp = mr_dict[0][1]
# Notebook display cells: total contribution score per task for the first pattern,
# via the single-task and the combined result objects
{k: v.sum() for k,v in mrp.get_pattern(mrp.patterns()[0]).contrib.items()}
{k: v.sum() for k,v in mr.get_pattern(mr.patterns()[0]).contrib.items()}
isf = ImpScoreFile(model_dir / 'deeplift.imp_score.h5')
len(isf)
# Open the underlying HDF5 handles of each per-task result
for k,v in mr_dict: v.open()
# Per-task footprint profiles computed during the modisco run
footprints = {t: read_pkl(model_dir / f"deeplift/{t}/out/{imp_score}/footprints.pkl")
              for t in tasks}
# Seqlet statistics table, rendered with matplotlib and saved as PDF
df = pd.DataFrame([{"TF": t, **mr.stats()} for t,mr in mr_dict])
df[['%clustered']] = np.round(df[['clustered_seqlets_frac']] * 100, decimals=1)
dfp = df[['TF', 'all_seqlets', 'clustered_seqlets', '%clustered', 'patterns']]
dfp
render_mpl_table(dfp, col_width=1.8,
                 row_height=0.3,
                 header_color='white', font_size=10,
                 row_colors = ['w'],
                 header_text_color='black')
plt.savefig(figures / 'seqlet-stats.pdf')
# IPython source introspection of the alignment helper (notebook cell)
optimal_align_patterns??
def cluster_align_patterns(patterns, n_clusters=9):
    """Hierarchically cluster patterns by sequence-IC similarity and align them.

    Args:
        patterns: list of pattern objects exposing a 'seq_ic' track.
        n_clusters: number of flat clusters to cut the dendrogram into.

    Returns:
        (similarity matrix, linkage matrix, inverse dendrogram order,
         aligned pattern list)
    """
    # Pairwise similarity on the sequence information-content track
    sim = similarity_matrix(patterns, track='seq_ic')
    # Ward linkage on the condensed distance vector (1 - similarity, upper triangle)
    upper = np.triu_indices(len(sim), 1)
    linkage_mat = linkage(1 - sim[upper], 'ward', optimal_ordering=True)
    # Flat cluster assignment and dendrogram-derived display order
    assignment = cut_tree(linkage_mat, n_clusters=n_clusters)[:, 0]
    order = np.argsort(leaves_list(linkage_mat))
    # Align patterns within clusters; max_shift=15 keeps the major patterns
    # from being shifted too far during alignment
    aligned = align_clustered_patterns(patterns, order, assignment,
                                       align_track='seq_ic',
                                       metric='continousjaccard',
                                       trials=20,
                                       max_shift=15)
    return sim, linkage_mat, np.argsort(order), aligned
# Extract all patterns supported by >= 100 seqlets, drop near-empty motifs
# (seq IC <= 4), and split the rest into short (< 30 IC) and long (>= 30 IC).
all_patterns = get_patterns(mr_dict, footprints, tasks, min_n_seqlets=100)
patterns = [p for p in all_patterns if p.seq_info_content > 4]
short_patterns = [p for p in patterns if p.seq_info_content < 30]
sim_seq, lm_seq, sort_idx, short_patterns_clustered = cluster_align_patterns(short_patterns, n_clusters=11)
pattern_thr_100 = get_patterns(mr_dict, footprints, tasks, min_n_seqlets=100)
long_patterns = [p for p in pattern_thr_100 if p.seq_info_content >= 30]
# BUGFIX: cluster_align_patterns returns a 4-tuple; the original bound the whole
# tuple to long_patterns_clustered. Unpack only the aligned-pattern element and
# discard the long-pattern similarity/linkage so that sim_seq/lm_seq above
# (needed for the short-pattern heatmap below) are not clobbered.
*_, long_patterns_clustered = cluster_align_patterns(long_patterns, n_clusters=25)
len(patterns)
len(long_patterns)
len(short_patterns)
# all_patterns = get_patterns(mr_dict, footprints, tasks, min_n_seqlets=100)
# sim_seq, lm_seq, sort_idx, all_patterns_clustered = cluster_align_patterns(all_patterns, n_clusters=25)
# all_patterns = get_patterns(mr_dict, footprints, tasks, min_n_seqlets=100)
# patterns = [p for p in all_patterns if p.seq_info_content > 4]
# short_patterns = [p for p in patterns if p.seq_info_content < 30]
# sim_seq, lm_seq, sort_idx, short_patterns_clustered = cluster_align_patterns(short_patterns, n_clusters=10)
# Prefix pattern names with their TF for display
short_patterns = [p.rename(p.attrs['TF'] + '/' + p.name) for p in short_patterns]
short_patterns_clustered = [p.rename(p.attrs['TF'] + '/' + p.name) for p in short_patterns_clustered]
len(short_patterns)
len(long_patterns)
def short_name(pn):
    """Compress a full pattern name 'task/metacluster/pattern_i' to 'task/p<i>'."""
    task, _, pattern = pn.split("/")
    return f"{task}/p{pattern.split('_')[1]}"
p = short_patterns_clustered[0]
# Plot each clustered short pattern's sequence-IC logo, trimmed at IC >= 0.08
# NOTE(review): indentation was lost in this export; strip_axis is assumed to
# run once per plotted pattern — confirm against the original notebook.
for i in range(len(short_patterns_clustered)):
    short_patterns_clustered[i].trim_seq_ic(0.08).plot('seq_ic', height=0.4, letter_width=0.12);
    strip_axis(plt.gca())
# Similarity heatmap of short patterns, rows/cols ordered by the Ward linkage
pnames = [short_name(p.name) for p in short_patterns]
df = pd.DataFrame(sim_seq,
                  index=pnames,
                  columns=pnames)
cm = sns.clustermap(df,
                    row_linkage=lm_seq,
                    col_linkage=lm_seq,
                    cmap='Blues', figsize=get_figsize(.9, 1))
plt.savefig(figures / 'cluster.short-motifs.heatmap.100seqlets.pdf')
# Re-derive long patterns (seq IC >= 30, > 100 seqlets) from the combined
# modisco result, annotated with their seqlet counts. This OVERWRITES
# sim_seq / lm_seq / sort_idx with the long-pattern versions used below.
long_patterns = [p.add_attr('features', {"n seqlets": mr.n_seqlets(p.name)}) for p in mr.get_all_patterns()
                 if p.seq_info_content >= 30 and mr.n_seqlets(p.name) > 100]
sim_seq, lm_seq, sort_idx, long_patterns_clustered = cluster_align_patterns(long_patterns, n_clusters=len(long_patterns))
# Notebook display cell: show the function object
cluster_align_patterns
def short_name(pn):
    """Shorten 'task/metacluster/pattern_i' to 'task/p<i>'.

    NOTE: duplicate of the definition earlier in the notebook.
    """
    parts = pn.split("/")
    pattern_idx = parts[2].split("_")[1]
    return parts[0] + "/p" + pattern_idx
pnames = [short_name(p.name) for p in long_patterns]
# Rename long patterns to their short display names
long_patterns = [p.rename(short_name(p.name)) for p in long_patterns]
df = pd.DataFrame(sim_seq,
                  index=pnames,
                  columns=pnames)
# Logo plots for each clustered long pattern (IC-trimmed at 0.08)
# NOTE(review): indentation was lost in this export; strip_axis is assumed to
# run once per plotted pattern — confirm against the original notebook.
for i in range(len(long_patterns_clustered)):
    long_patterns_clustered[i].trim_seq_ic(0.08).plot('seq_ic', height=0.4, letter_width=0.12);
    strip_axis(plt.gca())
# Similarity heatmap of the long patterns
cm = sns.clustermap(df,
                    row_linkage=lm_seq,
                    col_linkage=lm_seq,
                    cmap='Blues', figsize=get_figsize(.5, 1))
plt.savefig(figures / 'cluster.long-motifs.heatmap.pdf')
# Note -> patterns with higher Sequence IC used a lower threshold (min_n_seqlets > 100)
# non-bias corrected
# Histogram of sequence information content across all retained patterns
fig, ax = plt.subplots(figsize=get_figsize(.25))
ax.hist([p.seq_info_content for p in patterns + long_patterns], 20);
ax.set_ylabel("Frequency");
ax.set_xlabel("Sequence IC");
fig.savefig(figures / 'pattern-length.pdf', bbox_inches='tight', pad_inches=0)
paper_config()
# Non-bias corrected
# Pattern table figure for the short patterns (reversed = top-down dendrogram order)
fig = plot_patterns(list(reversed(short_patterns_clustered)), tasks, pattern_trim=(27, 48))
fig.savefig(figures / 'short-patterns.table.100seqlets-v2.pdf')
fig.savefig(figures / 'short-patterns.table.100seqlets-v2.png')
short_patterns_rev = list(reversed(short_patterns_clustered))
# dfp = pd.read_csv(f"{model_dir}/deeplift/{task}/out/{imp_score}/pattern_table.csv")
short_patterns_rev_list = [p.name for p in short_patterns_rev]
len(short_patterns_rev_list)
# Assemble the per-task pattern tables into one frame indexed by full pattern name
dfp_list = []
for task in tasks:
    dfp = pd.read_csv(f"{model_dir}/deeplift/{task}/out/{imp_score}/pattern_table.csv")
    dfp['task'] = task
    dfp['pattern_name'] = task + '/' + dfp['pattern'].map(longer_pattern)
    dfp_list.append(dfp)
dfpl = pd.concat(dfp_list)
dfpl.head()
dfpl.set_index("pattern_name", inplace=True)
dfpl
# Profile contribution scores per (pattern, task), shown as a heatmap
# NOTE(review): assumes short_patterns_rev_list names match the
# '<task>/<longer_pattern>' index built above — verify after the TF-prefix rename.
dfp_imp = dfpl.loc[short_patterns_rev_list, [t + ' imp profile' for t in tasks]]
dfp_imp.columns = tasks
fig = plt.figure(figsize=get_figsize(0.5, aspect=2.5))
sns.heatmap(dfp_imp, cmap='Blues');
fig.savefig(figures / 'short-patterns.contrib_score.pdf')
# Subset to patterns shown in Figure 4
# Figure-4 pattern subset, read from a shared Google Sheet (requires network access)
df_subset = pd.read_csv("https://docs.google.com/spreadsheets/d/1MVBUbMs_F0UPgZxR8-v9eMM6DCFhVYc7kdH-C6WJnYU/export?gid=0&format=csv")
# Strip stray quote characters from the sheet's text columns
df_subset.name = df_subset.name.str.replace('"', '')
df_subset.task = df_subset.task.str.replace('"', '')
df_subset.head()
df_subset['long_pattern'] = df_subset.task + "/" + df_subset.pattern.map(longer_pattern)
paper_config()
# Contribution-score heatmap restricted to the Figure-4 subset of patterns
fig = plt.figure(figsize=get_figsize(0.5, aspect=2.5))
sns.heatmap(dfp_imp.loc[df_subset.long_pattern.values], cmap='Blues');
fig.savefig(figures / 'short-patterns.contrib_score.figure-subset.pdf')
p = short_patterns_rev[0]
# Total contribution score per (pattern, task) matrix
contrib_scores = np.empty((len(short_patterns_rev), len(tasks)))
for i,p in enumerate(short_patterns_rev):
    for j, task in enumerate(tasks):
        contrib_scores[i,j] = p.contrib[task].sum()
short_patterns_rev[8].plot('contrib');
sns.heatmap(contrib_scores);
# create the table with the histogram
# Non-bias corrected
# NOTE(review): all_patterns_clustered is defined only in a commented-out cell
# earlier in this notebook — as written this cell raises NameError; restore
# that cell (or substitute the clustered pattern lists) before running.
fig = plot_patterns(list(reversed(all_patterns_clustered)), tasks, pattern_trim=(5, 65))
fig.savefig(figures / 'all-patterns.table.pdf')
fig.savefig(figures / 'all-patterns.table.png')