In [2]:
# Experiment identifier: comma-separated hyper-parameter string, used verbatim
# as a directory-name component throughout this notebook
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
# Importance-score sub-path under each task's deeplift output directory
imp_score = 'profile/wn'

Figure 5

Three things to show for the long motifs

  • % Match to the repbase
    • do the same plot as before
  • merge significantly similar motifs into one
    • merge also the seqlets to make the meta-plots?
  • show the importance scores within each motif

TODO

  • [X] get the full pattern size
  • [X] export the TE annotations to a google sheet
    • add also a short path name
    • export to tsv or csv to figures
  • [X] dump the detailed plots into each figure
    • separate the file-name components by underscores instead of slashes and shorten the path
  • [X] write the TE <-> seq_info_content table to a file
    • additional column = Repbase TE + overlap fraction
  • [X] assemble the results into a figure
  • [X] for the profile plots, try having the TE <-> importance tracks (similar to figure 3)
    • [X] color the strands correspondingly for each TF
    • [X] invert the signal on the negative strand
  • [X] Get sequence histograms with wider context (100bp)
    • [X] Get absolute seqlet positions -> query the fasta file
In [154]:
# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
from basepair.plot.profiles import multiple_plot_stranded_profile
from basepair.plot.profiles import multiple_plot_stranded_profile
from basepair.exp.paper.config import *
from basepair.plot.table import render_mpl_table
from basepair.exp.paper.fig4 import *
from basepair.plot.tracks import tidy_motif_plot

paper_config()
In [155]:
# Output directories for this experiment's long-motif figures
fdir = Path(f"{ddir}/figures/modisco/{exp}/long-motifs")
# parents=True: also create missing parent directories instead of raising
fdir.mkdir(parents=True, exist_ok=True)
fdir_individual = fdir / 'individual'
fdir_individual.mkdir(parents=True, exist_ok=True)
In [156]:
# Directory holding the trained model and its deeplift/modisco outputs
model_dir = models_dir / exp
In [157]:
# Per-task modisco results, discovered patterns and footprints
mr_dict = OrderedDict([(task, ModiscoResult(model_dir / f"deeplift/{task}/out/{imp_score}/modisco.h5"))
           for task in tasks])
for k,v in mr_dict.items(): v.open()
# NOTE(review): despite the `_dict` suffix this is a list of (task, patterns) tuples
patterns_dict = [(task, read_pkl(model_dir / f"deeplift/{task}/out/{imp_score}/patterns.pkl"))
                  for task in tasks]
footprints = {t: read_pkl(model_dir / f"deeplift/{t}/out/{imp_score}/footprints.pkl")
              for t in tasks}
In [ ]:
# Load the importance score files
isf = ImpScoreFile(model_dir / 'deeplift.imp_score.h5', default_imp_score=imp_score)
isf.cache()  # load all the regions (presumably into memory for fast repeated access -- confirm)

Implement seqlet enrichment

In [ ]:
# Pre-clustering patterns per task; list of (task, patterns) tuples
unclustered_patterns_dict = [(task, read_pkl(model_dir / f"deeplift/{task}/out/{imp_score}/unclustered.patterns.pkl"))
                             for task in tasks]
In [ ]:
def get_seqlets_bedtool(p):
    """Open the exported seqlet bed file for pattern `p` as a BedTool.

    The path is derived from the pattern's 'TF' attribute and its name.
    """
    task = p.attrs['TF']
    bed_file = model_dir / f"deeplift/{task}/out/{imp_score}/seqlets/{p.name}.bed.gz"
    return BedTool(str(bed_file))
In [ ]:
# re-order the seqlets
# Re-aggregate every unclustered pattern with its seqlet genomic intervals
# attached, then keep well-supported patterns and prefix names with the TF.
out = []
for task, ipatterns in unclustered_patterns_dict:
    mr = mr_dict[task]
    for p in ipatterns:
        n_seqlets = p.attrs['n_seqlets']
        sti = p.attrs['stacked_seqlet_imp']
        # attach the seqlet genomic intervals to the stacked importance object
        sti.dfi = mr.get_seqlet_intervals(p.name, as_df=True)
        o = (sti.aggregate().
               add_attr('TF', task).
               add_attr("n_seqlets", n_seqlets).
               add_attr("stacked_seqlet_imp", sti).
               add_attr("features", {"n seqlets": n_seqlets}))
        o.name = p.name
        out.append(o)

# keep only patterns supported by more than 100 seqlets
patterns2 = [p for p in out if p.attrs['n_seqlets'] > 100]
patterns_long_name = [p.copy() for p in patterns2]
# update the names
# prefix with the TF so names are unique across tasks, e.g. 'Oct4/<pattern>'
for p in patterns_long_name:
    p.name = p.attrs['TF'] + '/' + p.name
patterns_by_name = {p.name: p for p in patterns_long_name}
In [ ]:
# Keep only patterns with total sequence information content above 4
patterns = [p for p in patterns_long_name if p.seq_info_content > 4]
In [ ]:
len(patterns)
In [ ]:
p = patterns[0]

Overlapping seqlet regions with RepeatMasker regions

Load the repeat masker file

In [ ]:
from basepair.exp.paper.repeat_masker import download_repeat_masker, read_repeat_masker, intersect_repeat_masker
from pybedtools import BedTool
In [ ]:
# RepeatMasker annotation table (download_repeat_masker fetches it if needed)
dfrm = read_repeat_masker(download_repeat_masker())

Intersect seqlet locations with repeat masker

In [ ]:
from basepair.modisco.core import patterns_to_df
In [ ]:
# RepeatMasker annotations as a BedTool for interval intersection
bt_te = BedTool.from_dataframe(dfrm)
In [ ]:
# Intersect each pattern's seqlets with the repeat annotations in parallel.
# f=0.9 is a minimum-overlap fraction -- presumably bedtools' -f; confirm.
dfi_raw = pd.concat(Parallel(n_jobs=20)(delayed(intersect_repeat_masker)(p.attrs['TF'] + "/" + p.name, get_seqlets_bedtool(p), bt_te, f=0.9) for p in tqdm(patterns2)))
In [ ]:
dfi = dfi_raw.copy()
In [ ]:
# Restrict only to LTR/
# dfi = dfi[dfi.repeat_family.str.startswith("LTR/")]
In [ ]:
# Append some stats
# Number of distinct repeat intervals overlapped by each pattern's seqlets
unique_elements = dfi.groupby(['pattern_name']).interval.nunique()
dfiu = dfi[['pattern_name', 'n_pattern']].drop_duplicates().set_index('pattern_name').join(unique_elements)
# NOTE(review): despite the name, this is the overlap fraction with ANY repeat
# class -- the LTR-only filter in the cell above is commented out
dfiu['LTR_overlap_frac'] = dfiu.interval / dfiu.n_pattern
dfi = pd.merge(dfi, dfiu.reset_index()[['pattern_name', 'LTR_overlap_frac']], on='pattern_name', how='left')
dfi = dfi.drop_duplicates()
In [ ]:
# Append properties
# Per-pattern property table joined with the repeat-overlap stats
df = patterns_to_df(patterns_long_name, properties=['name', 'short_name', 'seq_info_content'])
df = df.set_index('name').join(dfiu).reset_index()

Scatter-plot: Seq IC ~ LTR_overlap_frac

In [ ]:
# Thresholds used to call a pattern a transposable element (TE)
TE_min_seq_IC = 30
TE_min_LTR_Overlap_frac = 0.7
In [ ]:
paper_config()
In [21]:
height = get_figsize(0.25)[0]
# Scatter of repeat-overlap fraction vs. sequence IC with marginal histograms.
# NOTE: x/y passed as keywords -- positional x/y were removed in newer seaborn.
fig = sns.jointplot(x="LTR_overlap_frac", y="seq_info_content", marginal_kws=dict(bins=15), s=5, data=df, height=height);
fig.ax_joint.axvline(TE_min_LTR_Overlap_frac, linestyle='--', color='grey', alpha=.4);
# fig.ax_joint.axhline(TE_min_seq_IC, linestyle='--', color='grey', alpha=.4);
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/matplotlib/tight_layout.py:199: UserWarning: Tight layout not applied. tight_layout cannot make axes width small enough to accommodate all axes decorations
  warnings.warn('Tight layout not applied. '
In [24]:
fig.savefig(f"{fdir}/LTR_overlap_frac,seq-IC.scatter.pdf")
In [22]:
# Label patterns as short/long by total sequence IC.
# Uses the shared TE_min_seq_IC constant (was a hard-coded 30, same value).
df['group'] = (df['seq_info_content'] > TE_min_seq_IC).map({False: 'short', True: 'long'})

Density plot as an alternative to scatter plot

In [26]:
# Density of repeat-overlap fraction, split by motif length group
plotnine.options.figure_size = get_figsize(0.25)
fig = (ggplot(aes(x='LTR_overlap_frac', color='group'), data=df) +
 geom_density() +
 scale_color_brewer(type='qual', palette=3) + 
 xlab("LTR overlap fraction") +
 theme_classic(base_size=10, base_family='Arial')
)
fig
Out[26]:
<ggplot: (-9223363280179665902)>
In [27]:
fig.save(f"{fdir}/LTR_overlap_frac.density.pdf")
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:706: UserWarning: Saving 1.712595 x 1.05838371 in image.
  from_inches(height, units), units))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:707: UserWarning: Filename: /users/avsec/workspace/basepair/data/figures/modisco/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/long-motifs/LTR_overlap_frac.density.pdf
  warn('Filename: {}'.format(filename))
In [28]:
# TE's are in the top right
# pattern_te_names = df[(df.seq_info_content > TE_min_seq_IC) & (df.LTR_overlap_frac > TE_min_LTR_Overlap_frac)].sort_values('LTR_overlap_frac', ascending=False).name.unique()
In [23]:
# TE's are in the top right
# (only the IC threshold is applied here; the overlap-fraction cut is disabled above)
pattern_te_names = df[(df.seq_info_content > TE_min_seq_IC)].sort_values('LTR_overlap_frac', ascending=False).name.unique()
In [30]:
len(pattern_te_names)
Out[30]:
18

We can see that motifs with high IC frequently fall within regions of known TEs

In [31]:
dfi.repeat_name.value_counts()
Out[31]:
RLTR9E       2098
RLTR13D6     1493
RLTR9D       1015
             ... 
MER125          1
MERVL-int       1
UCON9           1
Name: repeat_name, Length: 886, dtype: int64
In [32]:
dfi.repeat_family.value_counts()
Out[32]:
LTR/ERVK           13985
LTR/ERVL-MaLR       3324
SINE/Alu            1828
                   ...  
DNA/TcMar-Pogo         1
DNA/hAT-Tip100?        1
SINE/tRNA-RTE          1
Name: repeat_family, Length: 54, dtype: int64

PFM vs CWM

In [146]:
# Pattern summary table concatenated across all tasks
dfp = pd.concat([pd.read_csv(f"{model_dir}/deeplift/{task}/out/{imp_score}/pattern_table.csv").assign(task=task)
                 for task in tasks])

dfp['pattern_name'] = dfp['task'] + '/' + dfp['pattern']
# Contribution summaries across tasks (per pattern row)
dfp['max_contrib'] = dfp[[f'{task} imp profile' for task in tasks]].max(axis=1)
dfp['sum_contrib'] = dfp[[f'{task} imp profile' for task in tasks]].sum(axis=1)

# Contribution of the pattern within its own task
dfp['contrib'] = [row[f'{row.task} imp profile'] for i, row in dfp.iterrows()]
dfp['length'] = dfp.consensus.str.len()
dfp['total_contrib'] = dfp['contrib'] * dfp['length']
# Total IC = per-position mean IC times motif length
dfp['IC'] = dfp['length'] * dfp['ic pwm mean']
# Reuse sum_contrib instead of recomputing the identical row-wise sum
dfp['total_sum_contrib'] = dfp['sum_contrib'] * dfp['length']

# Main motifs: total IC > 4, supported by > 100 seqlets
dfp = dfp[dfp['IC'] > 4]
dfp = dfp[dfp['n seqlets'] > 100]
In [50]:
# dfp['max_contrib_count'] = dfp[[f'{task} imp profile' for task in tasks]].max(axis=1)
# dfp['contrib_count'] = [row[f'{row.task} imp profile'] for i, row in dfp.iterrows()]
In [51]:
dfp.head()
Out[51]:
Klf4 footprint counts Klf4 footprint entropydiff Klf4 footprint max Klf4 footprint strandcor Klf4 imp counts Klf4 imp profile Klf4 periodicity 10bp Klf4 pos meandiff Klf4 pos std Klf4 pos unimodal Klf4 region counts Nanog footprint counts Nanog footprint entropydiff Nanog footprint max Nanog footprint strandcor Nanog imp counts Nanog imp profile Nanog periodicity 10bp Nanog pos meandiff Nanog pos std Nanog pos unimodal Nanog region counts Oct4 footprint counts Oct4 footprint entropydiff Oct4 footprint max Oct4 footprint strandcor Oct4 imp counts Oct4 imp profile Oct4 periodicity 10bp Oct4 pos meandiff Oct4 pos std Oct4 pos unimodal Oct4 region counts Sox2 footprint counts Sox2 footprint entropydiff Sox2 footprint max Sox2 footprint strandcor Sox2 imp counts Sox2 imp profile Sox2 periodicity 10bp Sox2 pos meandiff Sox2 pos std Sox2 pos unimodal Sox2 region counts consensus ic pwm mean idx n seqlets pattern task pattern_name max_contrib length IC contrib total_contrib max_contrib_count contrib_count
0 143.6074 0.0257 0.5459 0.0051 0.0203 0.0105 0.0686 NaN NaN NaN 402.9995 473.8224 0.0760 2.3823 0.0053 0.0538 0.0236 0.3644 NaN NaN NaN 956.8774 263.7851 0.1400 2.9668 0.0057 0.0672 0.0255 0.0679 1.9860 121.6666 True 572.4199 88.1211 0.1908 1.1994 0.0060 0.0480 0.0194 0.1548 NaN NaN NaN 205.1461 CCTTTGTTATGCAAAT 0.8710 0 10742 m0_p0 Oct4 Oct4/m0_p0 0.0255 16 13.9357 0.0255 0.4078 0.0255 0.0255
1 149.8835 0.0166 0.5083 0.0050 0.0076 0.0053 0.1108 NaN NaN NaN 429.0092 364.6005 0.0323 1.3482 0.0050 0.0212 0.0110 0.1445 NaN NaN NaN 822.5327 184.1158 0.0282 0.7830 0.0051 0.0301 0.0136 0.0723 1.9790 122.8195 True 472.7943 53.2370 0.0244 0.2550 0.0050 0.0186 0.0068 0.1040 NaN NaN NaN 159.0560 TTATGCAAAT 1.0574 1 1517 m0_p1 Oct4 Oct4/m0_p1 0.0136 10 10.5739 0.0136 0.1359 0.0136 0.0136
2 152.2931 0.0188 0.5257 0.0050 0.0090 0.0061 0.0577 NaN NaN NaN 428.6114 403.7701 0.0323 1.5600 0.0051 0.0278 0.0145 0.2506 NaN NaN NaN 870.4996 201.9634 0.0427 0.9988 0.0051 0.0336 0.0146 0.0299 2.0021 118.7279 True 502.7864 60.7334 0.0370 0.3309 0.0050 0.0227 0.0087 0.0897 NaN NaN NaN 170.4241 ATATGCAAAT 1.1637 2 1297 m0_p2 Oct4 Oct4/m0_p2 0.0146 10 11.6365 0.0146 0.1459 0.0146 0.0146
3 240.1971 0.0313 1.0273 0.0051 0.0121 0.0074 0.0540 NaN NaN NaN 570.9031 1102.3789 0.1304 6.9842 0.0054 0.0502 0.0264 0.4378 NaN NaN NaN 1782.2529 240.9923 0.0220 0.9737 0.0050 0.0267 0.0073 0.0769 1.9750 106.9080 True 554.0219 152.1856 0.1517 1.9368 0.0058 0.0291 0.0174 0.2354 NaN NaN NaN 295.8698 CCTTTGTTCT 0.9832 3 1052 m0_p3 Oct4 Oct4/m0_p3 0.0264 10 9.8321 0.0073 0.0730 0.0264 0.0073
4 347.7702 0.1958 3.1925 0.0059 0.0673 0.0439 0.0247 NaN NaN NaN 738.7773 577.4963 0.0324 2.5325 0.0050 0.0152 0.0110 0.1467 NaN NaN NaN 1245.0659 197.0388 0.0409 1.0194 0.0049 0.0202 0.0068 0.0381 1.9809 176.7330 True 526.9279 75.3001 0.0351 0.3578 0.0050 0.0174 0.0061 0.0606 NaN NaN NaN 202.1866 GCCACACCCAG 1.0765 4 970 m0_p4 Oct4 Oct4/m0_p4 0.0439 11 11.8415 0.0068 0.0747 0.0439 0.0068
In [84]:
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='ic pwm mean', y='max_contrib', color='IC'), dfp) + 
 geom_point() + 
 theme_classic()
)
Out[84]:
<ggplot: (-9223363278448453979)>
In [85]:
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='ic pwm mean', y='max_contrib', color='length'), dfp) + 
 geom_point() + 
 theme_classic()
)
Out[85]:
<ggplot: (-9223363278448472632)>
In [86]:
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='ic pwm mean', y='max_contrib', color='length'), dfp) + 
 geom_point() + 
 theme_classic()
)
Out[86]:
<ggplot: (8758406225777)>
In [87]:
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='ic pwm mean', y='contrib', color='length'), dfp) + 
 geom_point() + 
 theme_classic()
)
Out[87]:
<ggplot: (-9223363278448605289)>
In [88]:
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='IC', y='max_contrib', color='length'), dfp) + 
 geom_point() + 
 theme_classic()
)
Out[88]:
<ggplot: (8758406225770)>
In [89]:
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='IC', y='length', color='length'), dfp) + 
 geom_point() + 
 theme_classic()
)
Out[89]:
<ggplot: (8758406128802)>
In [90]:
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='IC', y='contrib', color='length'), dfp) + 
 geom_point() + 
 theme_classic()
)
Out[90]:
<ggplot: (-9223363278448711876)>
In [131]:
from adjustText import adjust_text
In [140]:
# Labelled IC-vs-contribution scatter; adjust_text de-overlaps the labels.
# NOTE(review): this cell is copy-pasted below for several y variables --
# consider factoring into a helper function taking the y column name.
plotnine.options.figure_size = get_figsize(1.5)
f = (ggplot(aes(x='IC', y='contrib', color='length', label='pattern_name'), dfp) + 
 geom_point() + 
 geom_text(size=7) + 
 theme_classic()
)
f = f.draw()
for a in f.axes:
    texts = [t for t in  a.texts]
    adjust_text(texts,ax=a, arrowprops=dict(arrowstyle='-', color='grey'))
In [147]:
plotnine.options.figure_size = get_figsize(1.5)
f = (ggplot(aes(x='IC', y='sum_contrib', color='length', label='pattern_name'), dfp) + 
 geom_point() + 
 geom_text(size=7) + 
 theme_classic()
)
f = f.draw()
for a in f.axes:
    texts = [t for t in  a.texts]
    adjust_text(texts,ax=a, arrowprops=dict(arrowstyle='-', color='grey'))
In [150]:
plotnine.options.figure_size = get_figsize(1.5)
f = (ggplot(aes(x='ic pwm mean', y='sum_contrib', color='length', label='pattern_name'), dfp) + 
 geom_point() + 
 geom_text(size=7) + 
 theme_classic() + 
 ylab("Average contrib")
)
f = f.draw()
for a in f.axes:
    texts = [t for t in  a.texts]
    adjust_text(texts,ax=a, arrowprops=dict(arrowstyle='-', color='grey'))
In [148]:
plotnine.options.figure_size = get_figsize(1.5)
f = (ggplot(aes(x='IC', y='total_sum_contrib', color='length', label='pattern_name'), dfp) + 
 geom_point() + 
 geom_text(size=7) + 
 theme_classic()
)
f = f.draw()
for a in f.axes:
    texts = [t for t in  a.texts]
    adjust_text(texts,ax=a, arrowprops=dict(arrowstyle='-', color='grey'))
In [152]:
plotnine.options.figure_size = get_figsize(1.5)
f = (ggplot(aes(x='IC', y='total_contrib', color='length', label='pattern_name'), dfp) + 
 geom_point() + 
 geom_text(size=7) + 
 theme_classic()
)
f = f.draw()
for a in f.axes:
    texts = [t for t in  a.texts]
    adjust_text(texts,ax=a, arrowprops=dict(arrowstyle='-', color='grey'))
In [134]:
import hvplot.pandas
In [139]:
dfp.hvplot(x='IC', y='contrib', kind='scatter', value_label='pattern_name', alpha=0.7, persist=False)
Out[139]:
In [91]:
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='IC', y='total_contrib', color='length'), dfp) + 
 geom_point() + 
 theme_classic()
)
Out[91]:
<ggplot: (8758406094898)>
In [123]:
from scipy.stats import spearmanr
In [124]:
# Per-pattern summary built directly from the pattern objects
dfp2 = pd.DataFrame([{"contrib": p.contrib[p.attrs['TF']].sum(), 
                      "IC": p.seq_info_content, 
                      "name": p.name, 
                      # rank correlation between sequence-IC and contribution tracks
                      "corr": spearmanr(p._get_track('seq_ic').ravel(), p.contrib[p.attrs['TF']].ravel())[0],
                      "log_n_seqlets": np.log10(p.attrs['n_seqlets']), 
                      # NOTE(review): magic 30 duplicates TE_min_seq_IC
                      'is_te': p.seq_info_content > 30} 
                     for p in patterns])
In [125]:
# Contribution normalized by information content
dfp2['contrib-per-IC'] = dfp2['contrib'] / dfp2['IC']
In [126]:
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='IC', y='contrib', color='is_te'), dfp2) + 
 geom_point() + 
 theme_classic()
)
Out[126]:
<ggplot: (8758406212854)>
In [127]:
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='corr', y='contrib-per-IC', color='is_te'), dfp2) + 
 geom_point() + 
 theme_classic()
)
Out[127]:
<ggplot: (8758406454045)>
In [128]:
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='IC', y='contrib', color='is_te'), dfp2) + 
 geom_point() + 
 theme_classic()
)
Out[128]:
<ggplot: (-9223363278447703450)>
In [93]:
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='IC', y='contrib', color='log_n_seqlets'), dfp2) + 
 geom_point() + 
 theme_classic()
)
Out[93]:
<ggplot: (-9223363278448444942)>
In [14]:
dfp.head()
Out[14]:
Klf4 footprint counts Klf4 footprint entropydiff Klf4 footprint max Klf4 footprint strandcor Klf4 imp counts Klf4 imp profile Klf4 periodicity 10bp Klf4 pos meandiff Klf4 pos std Klf4 pos unimodal Klf4 region counts Nanog footprint counts Nanog footprint entropydiff Nanog footprint max Nanog footprint strandcor Nanog imp counts Nanog imp profile Nanog periodicity 10bp Nanog pos meandiff Nanog pos std Nanog pos unimodal Nanog region counts Oct4 footprint counts Oct4 footprint entropydiff Oct4 footprint max Oct4 footprint strandcor Oct4 imp counts Oct4 imp profile Oct4 periodicity 10bp Oct4 pos meandiff Oct4 pos std Oct4 pos unimodal Oct4 region counts Sox2 footprint counts Sox2 footprint entropydiff Sox2 footprint max Sox2 footprint strandcor Sox2 imp counts Sox2 imp profile Sox2 periodicity 10bp Sox2 pos meandiff Sox2 pos std Sox2 pos unimodal Sox2 region counts consensus ic pwm mean idx n seqlets pattern task
0 143.6074 0.0257 0.5459 0.0051 0.0203 0.0105 0.0686 NaN NaN NaN 402.9995 473.8224 0.0760 2.3823 0.0053 0.0538 0.0236 0.3644 NaN NaN NaN 956.8774 263.7851 0.1400 2.9668 0.0057 0.0672 0.0255 0.0679 1.9860 121.6666 True 572.4199 88.1211 0.1908 1.1994 0.0060 0.0480 0.0194 0.1548 NaN NaN NaN 205.1461 CCTTTGTTATGCAAAT 0.8710 0 10742 m0_p0 Oct4
1 149.8835 0.0166 0.5083 0.0050 0.0076 0.0053 0.1108 NaN NaN NaN 429.0092 364.6005 0.0323 1.3482 0.0050 0.0212 0.0110 0.1445 NaN NaN NaN 822.5327 184.1158 0.0282 0.7830 0.0051 0.0301 0.0136 0.0723 1.9790 122.8195 True 472.7943 53.2370 0.0244 0.2550 0.0050 0.0186 0.0068 0.1040 NaN NaN NaN 159.0560 TTATGCAAAT 1.0574 1 1517 m0_p1 Oct4
2 152.2931 0.0188 0.5257 0.0050 0.0090 0.0061 0.0577 NaN NaN NaN 428.6114 403.7701 0.0323 1.5600 0.0051 0.0278 0.0145 0.2506 NaN NaN NaN 870.4996 201.9634 0.0427 0.9988 0.0051 0.0336 0.0146 0.0299 2.0021 118.7279 True 502.7864 60.7334 0.0370 0.3309 0.0050 0.0227 0.0087 0.0897 NaN NaN NaN 170.4241 ATATGCAAAT 1.1637 2 1297 m0_p2 Oct4
3 240.1971 0.0313 1.0273 0.0051 0.0121 0.0074 0.0540 NaN NaN NaN 570.9031 1102.3789 0.1304 6.9842 0.0054 0.0502 0.0264 0.4378 NaN NaN NaN 1782.2529 240.9923 0.0220 0.9737 0.0050 0.0267 0.0073 0.0769 1.9750 106.9080 True 554.0219 152.1856 0.1517 1.9368 0.0058 0.0291 0.0174 0.2354 NaN NaN NaN 295.8698 CCTTTGTTCT 0.9832 3 1052 m0_p3 Oct4
4 347.7702 0.1958 3.1925 0.0059 0.0673 0.0439 0.0247 NaN NaN NaN 738.7773 577.4963 0.0324 2.5325 0.0050 0.0152 0.0110 0.1467 NaN NaN NaN 1245.0659 197.0388 0.0409 1.0194 0.0049 0.0202 0.0068 0.0381 1.9809 176.7330 True 526.9279 75.3001 0.0351 0.3578 0.0050 0.0174 0.0061 0.0606 NaN NaN NaN 202.1866 GCCACACCCAG 1.0765 4 970 m0_p4 Oct4

For each pattern, do all seqlets fall into regions of the same LTR?

In [203]:
# For each pattern, find the single repeat element covering most of its seqlets
dfi_top = dfi.drop_duplicates()
# seqlet count per (pattern, repeat) and the fraction of the pattern it covers
dfi_top['n_repeat_name'] = dfi_top.groupby(['pattern_name', 'repeat_name']).repeat_name.transform('size')
dfi_top['n_repeat_frac'] = dfi_top['n_repeat_name'] / dfi_top['n_pattern']
# best-covered repeat per pattern (idxmax -> first row wins on ties)
dfi_top1 = dfi_top.groupby(['pattern_name']).apply(lambda x: x.loc[x.n_repeat_frac.idxmax()])
In [204]:
# Drop coordinate/bookkeeping columns not needed in the summary table
dfi_top1 = dfi_top1.drop(columns=['chrom', 'start', 'end', 'interval', 'pattern_name'])
In [205]:
len(dfi_top1)
Out[205]:
56

Long patterns and their LTR overlap

In [206]:
# "Long" patterns: total sequence IC of at least 30 (cf. TE_min_seq_IC)
long_patterns = [p for p in patterns_long_name if p.seq_info_content >= 30]
long_patterns_by_name = {p.name: p for p in long_patterns}
In [207]:
# Cluster and align similar long patterns so related motifs are plotted together
long_patterns_clustered = cluster_align_patterns(long_patterns, n_clusters=len(patterns_long_name)//2, trials=20, max_shift=20)
100%|██████████| 18/18 [00:01<00:00, 10.00it/s]
100%|██████████| 18/18 [00:00<00:00, 129.40it/s]

Plot long patterns

In [208]:
df_cp = pd.read_excel(f"{fdir}/../overlap_cwm_pwm.xlsx")
In [209]:
df_cp.is_te == 1
Out[209]:
0      True
1      True
2      True
      ...  
59    False
60    False
61    False
Name: is_te, Length: 62, dtype: bool
In [210]:
# Index the CWM/PWM overlap table by full pattern name
df_cp['pattern_name'] = df_cp['task'] + "/" + df_cp['pattern']
df_cp.set_index('pattern_name', inplace=True)
In [211]:
# NOTE(review): this rebinds `dfp` (previously the pattern summary table) to
# the CWM/PWM overlap rows, re-ordered to match the clustered patterns
dfp = df_cp.loc[[p.name for p in long_patterns_clustered]]
In [212]:
print(dfp[['n_pwm_gw', 'n_pwm_peaks', 'n_cwm_peaks', 'n_cwm_and_pwm_peaks', 'n_seqlets']].to_string())
                           n_pwm_gw  n_pwm_peaks  n_cwm_peaks  n_cwm_and_pwm_peaks  n_seqlets
pattern_name                                                                                 
Oct4/metacluster_0/pat...   76989.0       1446.0        914.0                309.0      355.0
Nanog/metacluster_0/pa...   67726.0       2710.0        709.0                640.0      571.0
Sox2/metacluster_0/pat...   73617.0       1288.0       1045.0                502.0      148.0
Nanog/metacluster_0/pa...   86528.0       2286.0        236.0                219.0      203.0
Oct4/metacluster_0/pat...   94727.0       1179.0        138.0                 97.0      138.0
Nanog/metacluster_0/pa...   61720.0       1902.0        187.0                171.0      129.0
Oct4/metacluster_0/pat...   93066.0       1205.0        145.0                 98.0      142.0
Sox2/metacluster_0/pat...   64904.0        550.0       1731.0                217.0      104.0
Nanog/metacluster_0/pa...   65114.0       1158.0        235.0                236.0      300.0
Nanog/metacluster_0/pa...   95160.0       2115.0        129.0                124.0      127.0
Nanog/metacluster_0/pa...   54448.0       2337.0        171.0                170.0      152.0
Nanog/metacluster_0/pa...   93401.0       2973.0        306.0                297.0      134.0
Nanog/metacluster_0/pa...   50691.0       1342.0        269.0                233.0      155.0
Nanog/metacluster_0/pa...   88149.0       3373.0        197.0                198.0      138.0
Klf4/metacluster_0/pat...   59346.0       1287.0        448.0                442.0      334.0
Klf4/metacluster_0/pat...   62180.0       2157.0        512.0                147.0      103.0
Klf4/metacluster_0/pat...   79361.0       3530.0       1797.0               1212.0      179.0
Oct4/metacluster_0/pat...   90990.0       1538.0        541.0                373.0      125.0
In [213]:
df_info = get_df_info(long_patterns_clustered, tasks)
In [ ]:
df_info.head()
In [216]:
dft = dfi_top1.loc[[p.name for p in long_patterns_clustered]]
In [219]:
print(dft.to_string())
                           n_pattern     repeat_name  repeat_family  LTR_overlap_frac  n_repeat_name  n_repeat_frac
pattern_name                                                                                                       
Oct4/metacluster_0/pat...        353          RLTR9E       LTR/ERVK            0.9065            139         0.3938
Nanog/metacluster_0/pa...        570          RLTR9E       LTR/ERVK            0.9316            251         0.4404
Sox2/metacluster_0/pat...        148        RLTR13D6       LTR/ERVK            0.9257             66         0.4459
Nanog/metacluster_0/pa...        203  MMERVK9C_I-int       LTR/ERVK            0.8966             87         0.4286
Oct4/metacluster_0/pat...        138  MMERVK9C_I-int       LTR/ERVK            0.8188             56         0.4058
Nanog/metacluster_0/pa...        129          RLTR17       LTR/ERVK            0.9147             60         0.4651
Oct4/metacluster_0/pat...        140         RLTR9A3       LTR/ERVK            0.9429             83         0.5929
Sox2/metacluster_0/pat...        104         RMER10A       LTR/ERVL            0.7885             24         0.2308
Nanog/metacluster_0/pa...        300        RLTR13D6       LTR/ERVK            0.8100            181         0.6033
Nanog/metacluster_0/pa...        127         RLTR13G       LTR/ERVK            0.7638             49         0.3858
Nanog/metacluster_0/pa...        152        RLTR13D6       LTR/ERVK            0.8816            103         0.6776
Nanog/metacluster_0/pa...        134        RLTR13D6       LTR/ERVK            0.9104             47         0.3507
Nanog/metacluster_0/pa...        155           BGLII       LTR/ERVK            0.8387             75         0.4839
Nanog/metacluster_0/pa...        138        RLTR13D6       LTR/ERVK            0.9783            101         0.7319
Klf4/metacluster_0/pat...        331        RLTR13B1       LTR/ERVK            0.9728            122         0.3686
Klf4/metacluster_0/pat...        103             MTD  LTR/ERVL-MaLR            0.0194              1         0.0097
Klf4/metacluster_0/pat...        179          ORR1A2  LTR/ERVL-MaLR            0.8659             39         0.2179
Oct4/metacluster_0/pat...        125       (TTAGGG)n  Simple_repeat            0.8080             22         0.1760
In [206]:
# clustered version of the plot
# One row per long pattern: sequence logo flanked by text columns
# (short name, #seqlets, #CWM peak matches | overlap fraction, repeat name).
# Size the axes by the list actually iterated (was len(long_patterns), which
# only happens to match len(long_patterns_clustered)).
fig, axes = plt.subplots(len(long_patterns_clustered), 1, figsize=get_figsize(.5, aspect=1.5))

for i, p in enumerate(long_patterns_clustered):
    # best-matching repeat annotation for this pattern
    ltr_name = dft.loc[p.name].repeat_name
    ltr_frac = dft.loc[p.name].LTR_overlap_frac

    ax = axes[i]
    # Motif logo (sequence information content)
    seqlogo(p.get_seq_ic(), ax=ax)
    ax.set_ylim([0, 2])  # all plots have the same shape
    strip_axis(ax)
    ax.axison = False
    if i == 0:
        ax.set_title("Sequence information content")

    # 'Oct4/metacluster_0/pattern_9' -> 'Oct4/9'
    task, mc, pl = p.name.split("/")
    pname = task + "/" + pl.split('_')[1]

    # text columns to the left of the motif
    ax.text(-25, 0, pname, fontsize=8, horizontalalignment='right')
    ax.text(-15, 0, str(p.attrs['n_seqlets']), fontsize=8, horizontalalignment='right')
    ax.text(-5, 0, int(dfp.loc[p.name].n_cwm_peaks), fontsize=8, horizontalalignment='right')

    # text columns to the right of the motif
    ax.text(75, 0, f"{ltr_frac:.2f}", fontsize=8, horizontalalignment='left')
    ax.text(85, 0, ltr_name, fontsize=8, horizontalalignment='left')
fig.savefig(f'{fdir}/pattern-table.all.pdf')
In [30]:
# For each repeat element, the pattern with the highest overlap fraction
main_patterns = dft.groupby("repeat_name").LTR_overlap_frac.idxmax().values
In [40]:
# Export the long-pattern <-> repeat table
dft.reset_index().to_csv(f"{fdir}/long_patterns.tsv", index=False, sep='\t')
In [41]:
dfi_top1
Out[41]:
n_pattern repeat_name repeat_family LTR_overlap_frac n_repeat_name n_repeat_frac
pattern_name
Klf4/metacluster_0/pattern_0 9134 RLTR9E LTR/ERVK 0.4940 619 0.0678
Klf4/metacluster_0/pattern_1 1988 RLTR13D6 LTR/ERVK 0.4145 68 0.0342
Klf4/metacluster_0/pattern_10 179 ORR1A2 LTR/ERVL-MaLR 0.8659 39 0.2179
... ... ... ... ... ... ...
Sox2/metacluster_0/pattern_5 132 ORR1C1 LTR/ERVL-MaLR 0.6136 25 0.1894
Sox2/metacluster_0/pattern_6 104 RMER10A LTR/ERVL 0.7885 24 0.2308
Sox2/metacluster_0/pattern_8 148 RLTR13D6 LTR/ERVK 0.9257 66 0.4459

56 rows × 6 columns

In [42]:
dfi_top1.LTR_overlap_frac.plot.hist(30);
In [43]:
dfi_top1.reset_index().to_excel(f"{fdir}/../repeat-masker-overlap.xlsx", index=False)
In [44]:
dfi_top1
Out[44]:
n_pattern repeat_name repeat_family LTR_overlap_frac n_repeat_name n_repeat_frac
pattern_name
Klf4/metacluster_0/pattern_0 9134 RLTR9E LTR/ERVK 0.4940 619 0.0678
Klf4/metacluster_0/pattern_1 1988 RLTR13D6 LTR/ERVK 0.4145 68 0.0342
Klf4/metacluster_0/pattern_10 179 ORR1A2 LTR/ERVL-MaLR 0.8659 39 0.2179
... ... ... ... ... ... ...
Sox2/metacluster_0/pattern_5 132 ORR1C1 LTR/ERVL-MaLR 0.6136 25 0.1894
Sox2/metacluster_0/pattern_6 104 RMER10A LTR/ERVL 0.7885 24 0.2308
Sox2/metacluster_0/pattern_8 148 RLTR13D6 LTR/ERVK 0.9257 66 0.4459

56 rows × 6 columns

In [45]:
p = long_patterns[0]
In [46]:
p.name
Out[46]:
'Oct4/metacluster_0/pattern_9'
In [47]:
def shorten_pname(name):
    """Shorten a full pattern name for use in file names.

    Example: 'Oct4/metacluster_0/pattern_9' -> 'Oct4.p9'
    """
    tf, _metacluster, pattern = name.split("/")
    pattern_id = pattern.replace("pattern_", "")
    return f"{tf}.p{pattern_id}"
In [48]:
# One detailed plot per long pattern: sequence IC + per-task contribution tracks
for p in long_patterns:
    # shared y-limits across this pattern's contribution tracks
    ymax = max([v.max() for v in p.contrib.values()])
    ymin = min([v.min() for v in p.contrib.values()])
    ltr_name = dft.loc[p.name].repeat_name
    ltr_frac = dft.loc[p.name].LTR_overlap_frac

    # Plot the other factor
    long_patterns_by_name[p.name].plot(['seq_ic', 'contrib'], 
           rotate_y=0, 
           letter_width=0.1, 
           height=0.4,
           # *4: one ylim per task -- assumes 4 tasks; TODO confirm
           ylim=[(0, 2)] + [(ymin, ymax)]*4,
           title=f"{p.name},  LTR: {ltr_name}, any LTR frac: {ltr_frac:.2f}");
    sns.despine(top=True, right=True, left=False, bottom=True)
    # file name: '<TF>.p<id>.<repeat_name>.contrib.pdf'
    out_name = shorten_pname(p.name) + "." + ltr_name
    plt.savefig(f"{fdir_individual}/{out_name}.contrib.pdf")

Sequence heatmap

In [ ]:
import collections
import collections.abc

from basepair.plot.tracks import *

def plot_track(arr, ax, legend=False, ylim=None, color=None):
    """Plot a single track onto `ax`, dispatching on the array's 2nd dimension.

    - 1-D or 1 column: a single line plot
    - 4 columns: a sequence logo (via `seqlogo`)
    - 2 columns: one line per strand, labelled 'pos' / 'neg'

    Args:
      arr: array of shape (seqlen,) or (seqlen, {1, 2, 4})
      ax: matplotlib axis to draw on
      legend: if True, add a legend (2-strand case only)
      ylim: optional (ymin, ymax) applied to the axis
      color: a single color, or a sequence of per-strand colors

    Raises:
      ValueError: if arr.shape[1] is not 1, 2 or 4.
    """
    seqlen = len(arr)
    if arr.ndim == 1 or arr.shape[1] == 1:
        # single track
        if color is not None:
            # a per-strand color sequence may be passed; use the first entry
            # (collections.abc.Sequence -- the bare collections alias was
            # removed in Python 3.10)
            if isinstance(color, collections.abc.Sequence):
                color = color[0]
        ax.plot(np.arange(1, seqlen + 1), np.ravel(arr), color=color)
    elif arr.shape[1] == 4:
        # plot seqlogo
        seqlogo(arr, ax=ax)
    elif arr.shape[1] == 2:
        # plot both strands
        if color is not None:
            assert isinstance(color, collections.abc.Sequence)
            c1 = color[0]
            c2 = color[1]
        else:
            c1, c2 = None, None
        ax.plot(np.arange(1, seqlen + 1), arr[:, 0], label='pos', color=c1)
        ax.plot(np.arange(1, seqlen + 1), arr[:, 1], label='neg', color=c2)
        if legend:
            ax.legend()
    else:
        # message fixed: report the actual (unsupported) shape[1] value
        raise ValueError(f"Don't know how to plot array with shape[1] == {arr.shape[1]}. Valid values are: 1,2 or 4.")
    if ylim is not None:
        ax.set_ylim(ylim)

def plot_tracks(tracks, seqlets=[], title=None, rotate_y=90, legend=False, fig_width=20,
                fig_height_per_track=2, ylim=None, same_ylim=False, ylab=True, color=None):
    """Plot multiple tracks stacked vertically on a shared x axis.

    One-hot-encoded sequence as a logo,
    and 1 or 2 dim tracks as normal line-plots.

    Args:
      tracks: dictionary of numpy arrays with the same axis0 length
      seqlets: optional seqlets highlighted on every track (read-only here)
      title: title placed on the top-most axis
      rotate_y: rotation (degrees) of the per-track y labels
      legend: forwarded to plot_track (used in the 2-strand case)
      fig_width: figure width
      fig_height_per_track: figure height per track.
      ylim: if not None, a single tuple or a list of tuples representing the ylim to use
      same_ylim: if True, use one shared (0, max-over-all-tracks) ylim
      ylab: whether to label each track's y axis with the track name
      color: single color or per-track list, forwarded to plot_track

    Returns:
      matplotlib.figure.Figure
    """
    
    tracks = skip_nan_tracks(tracks)  # ignore None values
    fig, axes = plt.subplots(len(tracks), 1,
                             figsize=(fig_width, fig_height_per_track * len(tracks)),
                             sharex=True)

    if len(tracks) == 1:
        axes = [axes]  # plt.subplots returns a bare axis when nrows == 1

    if same_ylim:
        ylim = (0, max([v.max() for k,v in get_items(tracks)]))
        
    for i, (ax, (track, arr)) in enumerate(zip(axes, get_items(tracks))):
        plot_track(arr, ax, legend, get_list_value(ylim, i), 
                   color=get_list_value(color, i))
        for seqlet in seqlets:
            plot_seqlet(seqlet, ax, add_label=i == 0)
        # ax.set_ylabel(track)
        if ylab:
            ax.set_ylabel(track, rotation=rotate_y,
                          multialignment='center',
                          ha='right', labelpad=5)
        simple_yaxis_format(ax)
        if i != len(tracks) - 1:
            # hide x tick marks on all but the bottom-most track
            ax.xaxis.set_ticks_position('none')
        if i == 0 and title is not None:
            ax.set_title(title)

        # if seqlets:
        #    pass

    # add ticks to the final axis
    ax.xaxis.set_major_locator(ticker.AutoLocator())
    # ax.xaxis.set_major_locator(ticker.MaxNLocator(4))
    # spaced_xticks(ax, spacing=5)
    fig.subplots_adjust(hspace=0)  # tracks touch vertically
    #fig.subplots_adjust(left=0, right=1, top=1, bottom=0)
    # cleanup the plot
    return fig


def get_tracks(p, tasks):
    """Assemble the ordered (name, array) track list for pattern `p`.

    Returns the sequence information-content track first, followed — for
    each task — by its profile track and its contribution track (both
    keyed by the task name). Two-column (stranded) arrays get their
    second column negated so the negative strand plots below the axis.
    """

    def _negate_second_strand(arr):
        flipped = arr.copy()
        if flipped.shape[1] == 2:
            flipped[:, 1] = -flipped[:, 1]
        return flipped

    by_kind = OrderedDict((kind, p._get_track(kind))
                          for kind in ['profile', 'seq_ic', 'contrib'])
    out = [('seq_ic', by_kind['seq_ic'])]
    for task in tasks:
        for kind in ['profile', 'contrib']:
            out.append((task, _negate_second_strand(by_kind[kind][task])))
    return out
In [ ]:
# Per-task maximum profile height across all clustered long patterns,
# excluding one outlier pattern that would otherwise dominate the y-scale.
pmax_dict = {t: max([p.profile[t].max() for p in long_patterns_clustered
                    if p.name != 'Klf4/metacluster_0/pattern_18']) 
             for t in tasks}
# NOTE(review): `p` here is a leftover loop variable from an earlier cell
# (hidden notebook state); presumably `p.tasks()` equals `tasks` — confirm,
# this cell breaks under Restart & Run All otherwise.
ylim_profiles = {t: (0, pmax_dict[t]) for t in p.tasks()}
In [ ]:
# Global y-limits across all contribution tracks of the current pattern `p`
# (relies on `p` from an earlier cell).
contrib_values = list(p.contrib.values())
ymax = max(track.max() for track in contrib_values)
ymin = min(track.min() for track in contrib_values)
In [ ]:
# Inspect the computed per-task y-limits (rendered via rich display)
ylim_profiles
In [ ]:
# ylim=ylim_profiles + [(0, 2)] + [(ymin, ymax)]*4
letter_width=0.1
height=0.4
rotate_y=0
colors = [None]
for task in tasks:
    colors.append((tf_colors[task], tf_colors[task] + "80"))  # 80 add alpha=0.5
    colors.append(None)

Seqlogo + profile

In [54]:
# Per-task profile maximum, excluding one outlier pattern from the scale.
pmax_dict = {t: max([p.profile[t].max() for p in long_patterns_clustered
                    if p.name != 'Oct4/metacluster_0/pattern_14']) 
             for t in tasks}
# NOTE(review): `p` is stale here (leftover from a previous cell) and this
# value is immediately recomputed inside the loop below — likely dead code.
ylim_profiles = [(0, pmax_dict[t]) for t in p.tasks()]
# One combined seqlogo + stranded-profile + contribution figure per pattern,
# saved as an individual PDF.
for p in long_patterns:
    # Contribution-score y-range for this pattern (shared by all its tracks)
    ymax = max([v.max() for v in p.contrib.values()])
    ymin = min([v.min() for v in p.contrib.values()])
    # Repbase/LTR annotation for the figure title and the output file name
    ltr_name = dft.loc[p.name].repeat_name
    ltr_frac = dft.loc[p.name].LTR_overlap_frac
    
    # NOTE(review): pymax/pymin and ylim_profiles are computed but never used.
    pymax = max([v.max() for v in p.profile.values()])
    pymin = min([v.min() for v in p.profile.values()])

    ylim_profiles = [(0, pmax_dict[t]) for t in p.tasks()]
    
    tracks = get_tracks(long_patterns_by_name[p.name], tasks)
    w,h = get_figsize(0.6, 1)
    # ylim: (0, 2) for the seq-logo row, then per task a symmetric profile
    # range followed by the shared contribution range.
    plot_tracks(tracks,
               # title=self.name,
               rotate_y=rotate_y,
               fig_width=w,
               fig_height_per_track=h / 9,
               color=colors,
               ylim=[(0, 2)] + [(-pmax_dict[task], pmax_dict[task]) if which == 'profile' else (ymin, ymax)
                                for task in tasks for which in ['profile', 'imp']],
               # height_ratios=[1] + [x for i in range(len(tasks)) for x in [1.2, 0.8]],
               title=f"{p.name},  LTR: {ltr_name}, any LTR frac: {ltr_frac:.2f}",
               ylab=ylab);  # NOTE(review): `ylab` must be defined in an earlier cell — confirm
    sns.despine(top=True, right=True, left=False, bottom=True)
    
    out_name = shorten_pname(p.name) + "." + ltr_name
    plt.savefig(f"{fdir_individual}/{out_name}.contrib-w-profiles.pdf")

Sequence heatmaps

In [ ]:
from basepair.plot.heatmaps import heatmap_sequence
from genomelake.extractors import FastaExtractor
from basepair.preproc import resize_interval, interval_center
from basepair.exp.te.dist import sort_seqs_kimura, consensus_dist_kimura

# Interval columns needed to map seqlets back to genomic coordinates.
dfi_cols = ['chrom', 'start', 'end', 'interval_from_task', 'seqname', 'strand']
In [ ]:
# fe = FastaExtractor(fasta_file, use_strand=True)

# for p in long_patterns:
#     dfi = p.attrs['stacked_seqlet_imp'].dfi[dfi_cols]
#     seqs = fe([resize_interval(x, 200) for x in BedTool.from_dataframe(dfi)])
#     fig = heatmap_sequence(sort_seqs_kimura(seqs), aspect=0.5, figsize_tmpl=get_figsize(.5, 20), cbar=False)
    
#     # Add the title
#     ltr_name = dft.loc[p.name].repeat_name
#     ltr_frac = dft.loc[p.name].LTR_overlap_frac
#     plt.title(f"{p.name}, \nLTR: {ltr_name}, any LTR frac: {ltr_frac:.2f}");    
    
#     out_name = shorten_pname(p.name) + "." + ltr_name
#     plt.savefig(f"{fdir_individual}/{out_name}.seq.pdf")
#     plt.savefig(f"{fdir_individual}/{out_name}.seq.png")
In [ ]:
# Per-example genomic ranges from the importance-score file; prefix all
# columns with 'example_' so they don't clash with seqlet columns on merge.
# NOTE: renames columns in place on the returned frame.
ranges = isf.get_ranges()
ranges.columns = ['example_' + c for c in ranges.columns]
In [ ]:
# NOTE(review): relies on `p` left over from an earlier cell (hidden state);
# `dfi` is recomputed per pattern inside the next cell anyway.
dfi = p.attrs['stacked_seqlet_imp'].dfi
In [72]:
# For every long pattern: map its seqlets back to genomic coordinates,
# sort the underlying sequences by Kimura distance to the consensus, and
# save sequence- and profile-heatmap PDFs.
profile_width = 200
for p in long_patterns:
    # Get LTR names for the title / output file name
    ltr_name = dft.loc[p.name].repeat_name
    ltr_frac = dft.loc[p.name].LTR_overlap_frac
    
    # Extract stacked seqlets again
    dfi = p.attrs['stacked_seqlet_imp'].dfi.copy()
    # adapt dfi to the right format: fixed-width intervals + center column
    dfi = resize_interval(dfi, width=profile_width)
    dfi['center'] = interval_center(dfi)
    task = p.name.split("/")[0]
    # .copy() so adding the column below doesn't trigger pandas'
    # SettingWithCopyWarning (we'd otherwise be writing into a slice view)
    ranges_subset = ranges[ranges.example_interval_from_task == task].copy()
    ranges_subset['example_idx_new'] = np.arange(len(ranges_subset))
    dfi = pd.merge(dfi, ranges_subset, left_on='seqname', right_on='example_idx_new')
    # Sanity checks: each seqlet must fall inside its parent example interval
    assert np.all(dfi.example_chrom == dfi.chrom)
    assert np.all(dfi.center > dfi.example_start)
    assert np.all(dfi.center < dfi.example_end)
    # Seqlet coordinates relative to the example window
    dfi['pattern_center'] = interval_center(dfi) - dfi['example_start']
    dfi['pattern_start'] = dfi['start'] - dfi['example_start']
    dfi['pattern_end'] = dfi['end'] - dfi['example_start']
    simp = isf.extract_dfi(dfi, profile_width=profile_width)
    # Re-sort the sequence + profiles according to the kimura distance
    simp = simp[np.argsort(consensus_dist_kimura(simp.seq))]
    
    # Plot sequence
    fig = simp.plot("seq", aspect=0.5, figsize_tmpl=get_figsize(.5, 20), cbar=False)
    plt.title(f"{p.name}, \nLTR: {ltr_name}, any LTR frac: {ltr_frac:.2f}");    
    
    out_name = shorten_pname(p.name) + "." + ltr_name
    fig.savefig(f"{fdir_individual}/{out_name}.heatmap-seq.pdf", raster=True)
    
    # Plot profile
    # NOTE(review): consider plt.close(fig) per iteration to bound memory use.
    fig = simp.plot("profile", figsize=get_figsize(2, 1))
    fig.savefig(f"{fdir_individual}/{out_name}.heatmap-profile.pdf", raster=True)
Removed 8/355 instances at the boundaries
Removed 2/138 instances at the boundaries
Removed 2/125 instances at the boundaries
Removed 5/142 instances at the boundaries
Removed 2/104 instances at the boundaries
Removed 1/148 instances at the boundaries
Removed 6/571 instances at the boundaries
Removed 3/300 instances at the boundaries
Removed 5/127 instances at the boundaries
Removed 6/334 instances at the boundaries
Removed 4/179 instances at the boundaries
Removed 13/103 instances at the boundaries
In [73]:
# Save the color-bar
# Render a dummy all-"A" one-hot matrix solely to export the shared
# sequence-heatmap colorbar as its own PDF.
from concise.preprocessing import encodeDNA

fig = heatmap_sequence(encodeDNA(["A"*100]*200), aspect=1, figsize_tmpl=get_figsize(1, 5), cbar=True);
fig.savefig(f"{fdir}/colorbar.pdf")

Visualizing the profiles for the 'main' pattern per LTR

In [38]:
# Index the long-named patterns by their full pattern name.
pl = dict((pattern.name, pattern) for pattern in patterns_long_name)
In [39]:
# Summary table (per-task statistics) for the clustered long patterns.
df_info = get_df_info(long_patterns_clustered, tasks)
In [40]:
# Pick the first long pattern for quick inspection in the next cells.
p = long_patterns[0]
In [41]:
# Number of seqlets supporting this pattern (rich display)
p.attrs['n_seqlets']
Out[41]:
355
In [42]:
# Full hierarchical name of the first long pattern (rich display)
long_patterns[0].name
Out[42]:
'Oct4/metacluster_0/pattern_9'
In [83]:
# Assemble the summary "pattern table" figure: one row per clustered long
# pattern with 8 columns — seq-logo (0), blank spacers / LTR annotation
# (1-3), and one stranded-profile panel per TF (4-7).
# fp_slice = slice(25, 175)
fp_slice = slice(10, 190)  # trim profile edges for the per-TF panels

# Per-task global profile maximum, used for a shared symmetric y-scale.
max_vals = {t: df_info.max()[t + "_max"] for t in tasks}

fig, axes = plt.subplots(len(long_patterns_clustered), 8, figsize=get_figsize(1, aspect=1))
# fig, axes = plt.subplots(2, 7, figsize=get_figsize(2, aspect=1/2))
fig.subplots_adjust(hspace=0, wspace=0)

# Get the ylim for each TF
# NOTE(review): contrib_ylim appears to be computed but never used below — confirm.
contrib_ylim = {tf: (min([p.contrib[p.attrs['TF']].min() for p in long_patterns_clustered if p.attrs['TF'] == tf] + [0]),
                     max([p.contrib[p.attrs['TF']].max() for p in long_patterns_clustered if p.attrs['TF'] == tf] + [0]))
               for tf in tasks}

for i, p in enumerate(long_patterns_clustered):
    #p = pl[pname]
     
#     if p.name in main_patterns:
#         continue
    # Motif logo
    
    # Repbase/LTR annotation for this pattern's row
    ltr_name = dft.loc[p.name].repeat_name
    ltr_frac = dft.loc[p.name].LTR_overlap_frac
    
    
    ax = axes[i, 0]
    # Text columns before: TF name + number of supporting seqlets
    ax.text(-1, 0, p.attrs["TF"] + "\n" + str(p.attrs["n_seqlets"]), fontsize=8, horizontalalignment='right')
    seqlogo(p.get_seq_ic(), ax=ax)
    ax.set_ylim([0, 2])  # all logos share the same y-scale
    strip_axis(ax)
    ax.axison = False
    pos1 = ax.get_position() # get the original position 
    # widen the logo axis to 3 columns and shrink its height
    pos2 = [pos1.x0, pos1.y0 + pos1.height * 0.4,  pos1.width *3, pos1.height * .5]
    ax.set_position(pos2) # set a new position
    if i == 0:
        ax.set_title("Sequence\ninfo. content")

    # TODO: columns 1-3 are blank spacers (the logo axis above overlaps them)
    ax = axes[i, 1]
    blank_ax(ax)
    ax = axes[i, 2]
    blank_ax(ax)
    ax = axes[i, 3]
    blank_ax(ax)
    # LTR name + overlap fraction, right-aligned next to the profile panels
    ax.text(1, 0, ltr_name + f"\n{ltr_frac:.2f}", fontsize=8, horizontalalignment='right')
    
    # Profile columns
    for j, task in enumerate(tasks):
        ax = axes[i, 4 + j]
        fp = p.profile[task]
        
        ax.plot(fp[fp_slice, 0], color=tf_colors[task])
        ax.plot(-fp[fp_slice, 1], color=tf_colors[task], alpha=0.5)  # negative strand
        
        ax.set_ylim([-max_vals[task], max_vals[task]])  # shared symmetric y-scale per TF
        strip_axis(ax)
        ax.axison = False
        
        if i == 0:
            ax.set_title(task)

fig.savefig(f'{fdir}/pattern-table.all.pdf')