# Experiment identifier (comma-separated hyperparameter string) and the
# importance-score track used throughout this notebook/script.
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
imp_score = 'profile/wn'
# Three things to show for the long motifs
# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
# Deduplicated: this import was repeated twice.
from basepair.plot.profiles import multiple_plot_stranded_profile
from basepair.exp.paper.config import *
from basepair.plot.table import render_mpl_table
from basepair.exp.paper.fig4 import *
from basepair.plot.tracks import tidy_motif_plot
paper_config()
# Output directories for figures: <ddir>/figures/modisco/<exp>/long-motifs
# plus a subfolder for the per-pattern plots.
fdir = Path(f"{ddir}/figures/modisco/{exp}/long-motifs")
# parents=True so the first run doesn't fail when intermediate dirs are missing
fdir.mkdir(parents=True, exist_ok=True)
fdir_individual = fdir / 'individual'
fdir_individual.mkdir(exist_ok=True)
# Locate the trained model directory and load, per task:
#   - ModiscoResult HDF5 handles (opened for random access below)
#   - discovered patterns (pickled)
#   - footprint profiles (pickled)
model_dir = models_dir / exp
mr_dict = OrderedDict([(task, ModiscoResult(model_dir / f"deeplift/{task}/out/{imp_score}/modisco.h5"))
                       for task in tasks])
for k, v in mr_dict.items(): v.open()
# NOTE(review): despite the name, this is a list of (task, patterns) tuples,
# not a dict.
patterns_dict = [(task, read_pkl(model_dir / f"deeplift/{task}/out/{imp_score}/patterns.pkl"))
                 for task in tasks]
footprints = {t: read_pkl(model_dir / f"deeplift/{t}/out/{imp_score}/footprints.pkl")
              for t in tasks}
# Load the importance score files
isf = ImpScoreFile(model_dir / 'deeplift.imp_score.h5', default_imp_score=imp_score)
isf.cache()  # load all the regions
unclustered_patterns_dict = [(task, read_pkl(model_dir / f"deeplift/{task}/out/{imp_score}/unclustered.patterns.pkl"))
                             for task in tasks]
def get_seqlets_bedtool(p):
    """Return a BedTool over the seqlet intervals of pattern `p`.

    The per-task BED file is located via the pattern's 'TF' attribute and the
    module-level globals `model_dir` / `imp_score`.
    """
    task = p.attrs['TF']
    return BedTool(str(model_dir / f"deeplift/{task}/out/{imp_score}/seqlets/{p.name}.bed.gz"))
# re-order the seqlets
# Re-aggregate the unclustered patterns, attaching the seqlet intervals,
# the originating task (TF) and the seqlet count as attributes so each
# aggregated pattern is self-contained.
out = []
for task, ipatterns in unclustered_patterns_dict:
    mr = mr_dict[task]
    for p in ipatterns:
        n_seqlets = p.attrs['n_seqlets']
        sti = p.attrs['stacked_seqlet_imp']
        # attach the genomic intervals of the seqlets
        sti.dfi = mr.get_seqlet_intervals(p.name, as_df=True)
        o = (sti.aggregate().
             add_attr('TF', task).
             add_attr("n_seqlets", n_seqlets).
             add_attr("stacked_seqlet_imp", sti).
             add_attr("features", {"n seqlets": n_seqlets}))
        o.name = p.name
        out.append(o)
# Keep only well-supported patterns (>100 seqlets)
patterns2 = [p for p in out if p.attrs['n_seqlets'] > 100]
patterns_long_name = [p.copy() for p in patterns2]
# update the names to be globally unique: "<TF>/<pattern name>"
for p in patterns_long_name:
    p.name = p.attrs['TF'] + '/' + p.name
patterns_by_name = {p.name: p for p in patterns_long_name}
# Patterns with non-trivial total sequence information content
patterns = [p for p in patterns_long_name if p.seq_info_content > 4]
len(patterns)
p = patterns[0]
p = patterns[0]
from basepair.exp.paper.repeat_masker import download_repeat_masker, read_repeat_masker, intersect_repeat_masker
from pybedtools import BedTool
dfrm = read_repeat_masker(download_repeat_masker())
from basepair.modisco.core import patterns_to_df
bt_te = BedTool.from_dataframe(dfrm)
dfi_raw = pd.concat(Parallel(n_jobs=20)(delayed(intersect_repeat_masker)(p.attrs['TF'] + "/" + p.name, get_seqlets_bedtool(p), bt_te, f=0.9) for p in tqdm(patterns2)))
dfi = dfi_raw.copy()
# Restrict only to LTR/
# dfi = dfi[dfi.repeat_family.str.startswith("LTR/")]
# Append some stats
unique_elements = dfi.groupby(['pattern_name']).interval.nunique()
dfiu = dfi[['pattern_name', 'n_pattern']].drop_duplicates().set_index('pattern_name').join(unique_elements)
dfiu['LTR_overlap_frac'] = dfiu.interval / dfiu.n_pattern
dfi = pd.merge(dfi, dfiu.reset_index()[['pattern_name', 'LTR_overlap_frac']], on='pattern_name', how='left')
dfi = dfi.drop_duplicates()
# Append properties
df = patterns_to_df(patterns_long_name, properties=['name', 'short_name', 'seq_info_content'])
df = df.set_index('name').join(dfiu).reset_index()
# Thresholds used to call a pattern a transposable element (TE)
TE_min_seq_IC = 30
TE_min_LTR_Overlap_frac = 0.7
paper_config()
height = get_figsize(0.25)[0]
# Joint scatter: LTR overlap fraction vs total sequence information content
fig = sns.jointplot("LTR_overlap_frac", "seq_info_content", marginal_kws=dict(bins=15), s=5, data=df, height=height);
fig.ax_joint.axvline(TE_min_LTR_Overlap_frac, linestyle='--', color='grey', alpha=.4);
# fig.ax_joint.axhline(TE_min_seq_IC, linestyle='--', color='grey', alpha=.4);
fig.savefig(f"{fdir}/LTR_overlap_frac,seq-IC.scatter.pdf")
# Split motifs into short/long groups by sequence IC (same cutoff as
# TE_min_seq_IC, hard-coded here)
df['group'] = (df['seq_info_content'] > 30).map({False: 'short', True: 'long'})
plotnine.options.figure_size = get_figsize(0.25)
fig = (ggplot(aes(x='LTR_overlap_frac', color='group'), data=df) +
       geom_density() +
       scale_color_brewer(type='qual', palette=3) +
       xlab("LTR overlap fraction") +
       theme_classic(base_size=10, base_family='Arial')
       )
fig
fig.save(f"{fdir}/LTR_overlap_frac.density.pdf")
# TE's are in the top right
# pattern_te_names = df[(df.seq_info_content > TE_min_seq_IC) & (df.LTR_overlap_frac > TE_min_LTR_Overlap_frac)].sort_values('LTR_overlap_frac', ascending=False).name.unique()
# TE's are in the top right
# NOTE(review): only the IC threshold is applied here; the overlap-fraction
# filter above was commented out.
pattern_te_names = df[(df.seq_info_content > TE_min_seq_IC)].sort_values('LTR_overlap_frac', ascending=False).name.unique()
len(pattern_te_names)
# We can see that motifs with high IC frequently fall within regions of known TEs
# Most common repeat names / families among the overlaps (notebook output)
dfi.repeat_name.value_counts()
dfi.repeat_family.value_counts()
# Load the per-task pattern summary tables and derive contribution statistics
dfp = pd.concat([pd.read_csv(f"{model_dir}/deeplift/{task}/out/{imp_score}/pattern_table.csv").assign(task=task)
                 for task in tasks])
dfp['pattern_name'] = dfp['task'] + '/' + dfp['pattern']
# Max / sum of the per-task importance-profile contribution across all tasks
dfp['max_contrib'] = dfp[[f'{task} imp profile' for task in tasks]].max(axis=1)
dfp['sum_contrib'] = dfp[[f'{task} imp profile' for task in tasks]].sum(axis=1)
# Contribution within the pattern's own task
dfp['contrib'] = [row[f'{row.task} imp profile'] for i, row in dfp.iterrows()]
dfp['length'] = dfp.consensus.str.len()
dfp['total_contrib'] = dfp['contrib'] * dfp['length']
# Total information content = motif length * mean per-position PWM IC
dfp['IC'] = dfp['length'] * dfp['ic pwm mean']
dfp['total_sum_contrib'] = dfp[[f'{task} imp profile' for task in tasks]].sum(axis=1) * dfp['length']
# Main motifs
dfp = dfp[dfp['IC'] > 4]
dfp = dfp[dfp['n seqlets'] > 100]
# dfp['max_contrib_count'] = dfp[[f'{task} imp profile' for task in tasks]].max(axis=1)
# dfp['contrib_count'] = [row[f'{row.task} imp profile'] for i, row in dfp.iterrows()]
dfp.head()
# Exploratory scatter plots (notebook cells, figures rendered inline):
# mean PWM IC / total IC vs various contribution measures.
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='ic pwm mean', y='max_contrib', color='IC'), dfp) +
 geom_point() +
 theme_classic()
 )
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='ic pwm mean', y='max_contrib', color='length'), dfp) +
 geom_point() +
 theme_classic()
 )
# NOTE(review): this cell duplicates the previous one exactly.
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='ic pwm mean', y='max_contrib', color='length'), dfp) +
 geom_point() +
 theme_classic()
 )
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='ic pwm mean', y='contrib', color='length'), dfp) +
 geom_point() +
 theme_classic()
 )
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='IC', y='max_contrib', color='length'), dfp) +
 geom_point() +
 theme_classic()
 )
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='IC', y='length', color='length'), dfp) +
 geom_point() +
 theme_classic()
 )
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='IC', y='contrib', color='length'), dfp) +
 geom_point() +
 theme_classic()
 )
from adjustText import adjust_text
# Labeled scatter plots; adjust_text de-overlaps the pattern-name labels
# after drawing the ggplot onto a matplotlib figure.
plotnine.options.figure_size = get_figsize(1.5)
f = (ggplot(aes(x='IC', y='contrib', color='length', label='pattern_name'), dfp) +
     geom_point() +
     geom_text(size=7) +
     theme_classic()
     )
f = f.draw()
for a in f.axes:
    texts = [t for t in a.texts]
    adjust_text(texts, ax=a, arrowprops=dict(arrowstyle='-', color='grey'))
plotnine.options.figure_size = get_figsize(1.5)
f = (ggplot(aes(x='IC', y='sum_contrib', color='length', label='pattern_name'), dfp) +
     geom_point() +
     geom_text(size=7) +
     theme_classic()
     )
f = f.draw()
for a in f.axes:
    texts = [t for t in a.texts]
    adjust_text(texts, ax=a, arrowprops=dict(arrowstyle='-', color='grey'))
plotnine.options.figure_size = get_figsize(1.5)
f = (ggplot(aes(x='ic pwm mean', y='sum_contrib', color='length', label='pattern_name'), dfp) +
     geom_point() +
     geom_text(size=7) +
     theme_classic() +
     ylab("Average contrib")
     )
f = f.draw()
for a in f.axes:
    texts = [t for t in a.texts]
    adjust_text(texts, ax=a, arrowprops=dict(arrowstyle='-', color='grey'))
plotnine.options.figure_size = get_figsize(1.5)
f = (ggplot(aes(x='IC', y='total_sum_contrib', color='length', label='pattern_name'), dfp) +
     geom_point() +
     geom_text(size=7) +
     theme_classic()
     )
f = f.draw()
for a in f.axes:
    texts = [t for t in a.texts]
    adjust_text(texts, ax=a, arrowprops=dict(arrowstyle='-', color='grey'))
plotnine.options.figure_size = get_figsize(1.5)
f = (ggplot(aes(x='IC', y='total_contrib', color='length', label='pattern_name'), dfp) +
     geom_point() +
     geom_text(size=7) +
     theme_classic()
     )
f = f.draw()
for a in f.axes:
    texts = [t for t in a.texts]
    adjust_text(texts, ax=a, arrowprops=dict(arrowstyle='-', color='grey'))
import hvplot.pandas
# Interactive scatter (hvplot) for hover-inspecting individual patterns
dfp.hvplot(x='IC', y='contrib', kind='scatter', value_label='pattern_name', alpha=0.7, persist=False)
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='IC', y='total_contrib', color='length'), dfp) +
 geom_point() +
 theme_classic()
 )
from scipy.stats import spearmanr
# Per-pattern summary computed directly from the pattern objects: total
# contribution within the pattern's own task, IC, rank correlation between
# sequence IC and contribution, and (log) seqlet count. `is_te` uses the
# same IC > 30 cutoff as above.
dfp2 = pd.DataFrame([{"contrib": p.contrib[p.attrs['TF']].sum(),
                      "IC": p.seq_info_content,
                      "name": p.name,
                      "corr": spearmanr(p._get_track('seq_ic').ravel(), p.contrib[p.attrs['TF']].ravel())[0],
                      "log_n_seqlets": np.log10(p.attrs['n_seqlets']),
                      'is_te': p.seq_info_content > 30}
                     for p in patterns])
dfp2['contrib-per-IC'] = dfp2['contrib'] / dfp2['IC']
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='IC', y='contrib', color='is_te'), dfp2) +
 geom_point() +
 theme_classic()
 )
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='corr', y='contrib-per-IC', color='is_te'), dfp2) +
 geom_point() +
 theme_classic()
 )
# NOTE(review): this cell duplicates the IC-vs-contrib plot above.
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='IC', y='contrib', color='is_te'), dfp2) +
 geom_point() +
 theme_classic()
 )
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='IC', y='contrib', color='log_n_seqlets'), dfp2) +
 geom_point() +
 theme_classic()
 )
dfp.head()
# For each pattern, find the single most frequently overlapped repeat element
dfi_top = dfi.drop_duplicates()
# Count of seqlets per (pattern, repeat name) and its fraction of the pattern
dfi_top['n_repeat_name'] = dfi_top.groupby(['pattern_name', 'repeat_name']).repeat_name.transform('size')
dfi_top['n_repeat_frac'] = dfi_top['n_repeat_name'] / dfi_top['n_pattern']
# Top repeat per pattern = row with the maximal repeat fraction
dfi_top1 = dfi_top.groupby(['pattern_name']).apply(lambda x: x.loc[x.n_repeat_frac.idxmax()])
# Drop per-seqlet coordinate columns; pattern_name stays as the index
del dfi_top1['chrom']
del dfi_top1['start']
del dfi_top1['end']
del dfi_top1['interval']
del dfi_top1['pattern_name']
len(dfi_top1)
# Long (high-IC) patterns, clustered/aligned for display
long_patterns = [p for p in patterns_long_name if p.seq_info_content >= 30]
long_patterns_by_name = {p.name: p for p in long_patterns}
long_patterns_clustered = cluster_align_patterns(long_patterns, n_clusters=len(patterns_long_name)//2, trials=20, max_shift=20)
# Manually curated CWM/PWM overlap table living next to the figure dir
df_cp = pd.read_excel(f"{fdir}/../overlap_cwm_pwm.xlsx")
# NOTE(review): this comparison is a no-op statement (leftover notebook
# cell output?) -- its result is discarded.
df_cp.is_te == 1
df_cp['pattern_name'] = df_cp['task'] + "/" + df_cp['pattern']
df_cp.set_index('pattern_name', inplace=True)
# Re-order the table to match the clustered pattern order
dfp = df_cp.loc[[p.name for p in long_patterns_clustered]]
print(dfp[['n_pwm_gw', 'n_pwm_peaks', 'n_cwm_peaks', 'n_cwm_and_pwm_peaks', 'n_seqlets']].to_string())
df_info = get_df_info(long_patterns_clustered, tasks)
df_info.head()
dft = dfi_top1.loc[[p.name for p in long_patterns_clustered]]
print(dft.to_string())
# clustered version of the plot
# One row per clustered long pattern: seq-IC logo plus text annotations
# (pattern name, seqlet count, CWM peak count, LTR name / overlap fraction).
# NOTE(review): the subplot grid is sized with len(long_patterns) but
# iterated with long_patterns_clustered -- confirm the two have equal length.
fig, axes = plt.subplots(len(long_patterns), 1, figsize=get_figsize(.5, aspect=1.5))
# gridspec_kw=dict(wspace=10))
# fig, axes = plt.subplots(2, 7, figsize=get_figsize(2, aspect=1/2))
# fig.subplots_adjust(hspace=0, wspace=2)
for i, p in enumerate(long_patterns_clustered):
    # p = pl[pname]
    # if p.name in main_patterns:
    #     continue
    # Motif logo
    ltr_name = dft.loc[p.name].repeat_name
    ltr_frac = dft.loc[p.name].LTR_overlap_frac
    ax = axes[i]
    # Text columns before
    seqlogo(p.get_seq_ic(), ax=ax)
    ax.set_ylim([0, 2])  # all plots have the same shape
    strip_axis(ax)
    ax.axison = False
    if i == 0:
        ax.set_title("Sequence information content")
    task, mc, pl = p.name.split("/")
    pname = task + "/" + pl.split('_')[1]
    # text in front of the motif (negative x = left of the logo)
    ax.text(-25, 0, pname, fontsize=8, horizontalalignment='right')
    ax.text(-15, 0, str(p.attrs['n_seqlets']), fontsize=8, horizontalalignment='right')
    ax.text(-5, 0, int(dfp.loc[p.name].n_cwm_peaks), fontsize=8, horizontalalignment='right')
    # text after the motif
    ax.text(75, 0, f"{ltr_frac:.2f}", fontsize=8, horizontalalignment='left')
    ax.text(85, 0, ltr_name, fontsize=8, horizontalalignment='left')
fig.savefig(f'{fdir}/pattern-table.all.pdf')
# Representative pattern per repeat_name = the one with max LTR overlap
main_patterns = dft.groupby("repeat_name").LTR_overlap_frac.idxmax().values
dft.reset_index().to_csv(f"{fdir}/long_patterns.tsv", index=False, sep='\t')
dfi_top1
dfi_top1.LTR_overlap_frac.plot.hist(30);
dfi_top1.reset_index().to_excel(f"{fdir}/../repeat-masker-overlap.xlsx", index=False)
dfi_top1
p = long_patterns[0]
p.name
def shorten_pname(name):
    """Shorten a full pattern name to "<TF>.p<idx>".

    Example: 'Oct4/metacluster_0/pattern_9' -> 'Oct4.p9'
    (the original docstring incorrectly said 'Oct4.p0').
    """
    tf, mc, pn = name.split("/")
    pn_id = pn.replace("pattern_", "")
    return f"{tf}.p{pn_id}"
# Plot the seq-IC + contribution tracks for each long pattern; one PDF per
# pattern named "<TF>.p<idx>.<repeat name>.contrib.pdf".
for p in long_patterns:
    # shared y-range across the pattern's contribution tracks
    ymax = max([v.max() for v in p.contrib.values()])
    ymin = min([v.min() for v in p.contrib.values()])
    ltr_name = dft.loc[p.name].repeat_name
    ltr_frac = dft.loc[p.name].LTR_overlap_frac
    # Plot the other factor
    long_patterns_by_name[p.name].plot(['seq_ic', 'contrib'],
                                       rotate_y=0,
                                       letter_width=0.1,
                                       height=0.4,
                                       ylim=[(0, 2)] + [(ymin, ymax)]*4,
                                       title=f"{p.name}, LTR: {ltr_name}, any LTR frac: {ltr_frac:.2f}");
    sns.despine(top=True, right=True, left=False, bottom=True)
    out_name = shorten_pname(p.name) + "." + ltr_name
    plt.savefig(f"{fdir_individual}/{out_name}.contrib.pdf")
import collections
from basepair.plot.tracks import *
def plot_track(arr, ax, legend=False, ylim=None, color=None):
    """Plot one track onto the matplotlib axis `ax`.

    Args:
        arr: numpy array; 1-D or (L, 1) -> single line plot,
            (L, 2) -> two stranded line plots (pos/neg),
            (L, 4) -> sequence logo.
        ax: matplotlib axis to draw on.
        legend: if True, add a legend (stranded tracks only).
        ylim: optional (ymin, ymax) applied to the axis.
        color: a single matplotlib color or a list/tuple of per-strand colors.

    Raises:
        ValueError: if `arr.shape[1]` is not 1, 2 or 4.
    """
    seqlen = len(arr)
    if arr.ndim == 1 or arr.shape[1] == 1:
        # single track: accept either one color or a per-strand list/tuple.
        # BUGFIX: the original tested isinstance(color, collections.Sequence),
        # which (a) was removed in Python 3.10 (AttributeError) and (b) matched
        # plain strings, truncating e.g. 'red' to 'r'.
        if color is not None and isinstance(color, (list, tuple)):
            color = color[0]
        ax.plot(np.arange(1, seqlen + 1), np.ravel(arr), color=color)
    elif arr.shape[1] == 4:
        # one-hot / IC matrix -> sequence logo
        seqlogo(arr, ax=ax)
    elif arr.shape[1] == 2:
        # plot both strands
        if color is not None:
            assert isinstance(color, (list, tuple))
            c1 = color[0]
            c2 = color[1]
        else:
            c1, c2 = None, None
        ax.plot(np.arange(1, seqlen + 1), arr[:, 0], label='pos', color=c1)
        ax.plot(np.arange(1, seqlen + 1), arr[:, 1], label='neg', color=c2)
        if legend:
            ax.legend()
    else:
        # BUGFIX: message previously read "shape[1] != {actual}" (backwards)
        raise ValueError(f"Don't know how to plot array with shape[1] == {arr.shape[1]}. Valid values are: 1, 2 or 4.")
    if ylim is not None:
        ax.set_ylim(ylim)
def plot_tracks(tracks, seqlets=None, title=None, rotate_y=90, legend=False, fig_width=20,
                fig_height_per_track=2, ylim=None, same_ylim=False, ylab=True, color=None):
    """Plot multiple stacked tracks sharing the x-axis.

    One-hot-encoded sequence is drawn as a logo; 1- or 2-dim tracks as line
    plots (see `plot_track`).

    Args:
        tracks: dictionary of numpy arrays with the same axis-0 length;
            None values are skipped.
        seqlets: optional list of seqlets to highlight on every track.
        title: optional title drawn on the top axis.
        rotate_y: rotation angle of the y-axis labels.
        legend: forwarded to `plot_track`.
        fig_width: figure width.
        fig_height_per_track: figure height per track.
        ylim: if not None, a single tuple or a list of tuples representing
            the ylim to use per track.
        same_ylim: if True, use (0, global max) for every track.
        ylab: if True, label each axis with its track name.
        color: a single color spec or a list with one entry per track.

    Returns:
        matplotlib.figure.Figure
    """
    # BUGFIX: `seqlets=[]` was a mutable default argument; use None sentinel.
    if seqlets is None:
        seqlets = []
    tracks = skip_nan_tracks(tracks)  # ignore None values
    fig, axes = plt.subplots(len(tracks), 1,
                             figsize=(fig_width, fig_height_per_track * len(tracks)),
                             sharex=True)
    if len(tracks) == 1:
        axes = [axes]  # plt.subplots returns a bare axis for n=1
    if same_ylim:
        ylim = (0, max([v.max() for k, v in get_items(tracks)]))
    for i, (ax, (track, arr)) in enumerate(zip(axes, get_items(tracks))):
        plot_track(arr, ax, legend, get_list_value(ylim, i),
                   color=get_list_value(color, i))
        for seqlet in seqlets:
            plot_seqlet(seqlet, ax, add_label=i == 0)
        # ax.set_ylabel(track)
        if ylab:
            ax.set_ylabel(track, rotation=rotate_y,
                          multialignment='center',
                          ha='right', labelpad=5)
        simple_yaxis_format(ax)
        if i != len(tracks) - 1:
            # hide x tick marks on all but the bottom axis
            ax.xaxis.set_ticks_position('none')
        if i == 0 and title is not None:
            ax.set_title(title)
    # add ticks to the final (bottom) axis
    ax.xaxis.set_major_locator(ticker.AutoLocator())
    # ax.xaxis.set_major_locator(ticker.MaxNLocator(4))
    # spaced_xticks(ax, spacing=5)
    fig.subplots_adjust(hspace=0)  # stack panels flush against each other
    # fig.subplots_adjust(left=0, right=1, top=1, bottom=0)
    return fig
def get_tracks(p, tasks):
    """Assemble the list of plotting tracks for pattern `p`.

    Returns a list of (name, array) pairs: the sequence-IC track first, then,
    for every task, its profile and contribution tracks with the second
    (negative-strand) column negated for mirrored plotting.
    """
    def _mirror_neg_strand(arr):
        # Negate column 1 of two-column (stranded) arrays; leave others as-is.
        mirrored = arr.copy()
        if mirrored.shape[1] == 2:
            mirrored[:, 1] = -mirrored[:, 1]
        return mirrored

    raw = {kind: p._get_track(kind) for kind in ['profile', 'seq_ic', 'contrib']}
    track_list = [('seq_ic', raw['seq_ic'])]
    for task in tasks:
        for kind in ['profile', 'contrib']:
            track_list.append((task, _mirror_neg_strand(raw[kind][task])))
    return track_list
# Shared plotting parameters for the per-pattern track plots. Profile
# y-limits are shared across patterns per task (excluding one outlier
# pattern) so the panels are visually comparable.
pmax_dict = {t: max([p.profile[t].max() for p in long_patterns_clustered
                     if p.name != 'Klf4/metacluster_0/pattern_18'])
             for t in tasks}
ylim_profiles = {t: (0, pmax_dict[t]) for t in p.tasks()}
ymax = max([v.max() for v in p.contrib.values()])
ymin = min([v.min() for v in p.contrib.values()])
ylim_profiles
# ylim=ylim_profiles + [(0, 2)] + [(ymin, ymax)]*4
letter_width=0.1
height=0.4
rotate_y=0
# One color entry per track: None for seq_ic, then (pos, neg) per task
# profile and None for the contribution track.
colors = [None]
for task in tasks:
    colors.append((tf_colors[task], tf_colors[task] + "80"))  # 80 add alpha=0.5
    colors.append(None)
# NOTE(review): pmax_dict / ylim_profiles are immediately recomputed here
# with a different excluded outlier, overwriting the values above.
pmax_dict = {t: max([p.profile[t].max() for p in long_patterns_clustered
                     if p.name != 'Oct4/metacluster_0/pattern_14'])
             for t in tasks}
ylim_profiles = [(0, pmax_dict[t]) for t in p.tasks()]
# Per-pattern figure: seq-IC logo + (profile, contribution) tracks per task,
# one PDF per pattern.
for p in long_patterns:
    ymax = max([v.max() for v in p.contrib.values()])
    ymin = min([v.min() for v in p.contrib.values()])
    ltr_name = dft.loc[p.name].repeat_name
    ltr_frac = dft.loc[p.name].LTR_overlap_frac
    pymax = max([v.max() for v in p.profile.values()])
    pymin = min([v.min() for v in p.profile.values()])
    ylim_profiles = [(0, pmax_dict[t]) for t in p.tasks()]
    tracks = get_tracks(long_patterns_by_name[p.name], tasks)
    w, h = get_figsize(0.6, 1)
    plot_tracks(tracks,
                rotate_y=rotate_y,
                fig_width=w,
                fig_height_per_track=h / 9,
                color=colors,
                # profile tracks get symmetric per-task limits; contribution
                # tracks share the pattern-wide (ymin, ymax)
                ylim=[(0, 2)] + [(-pmax_dict[task], pmax_dict[task]) if which == 'profile' else (ymin, ymax)
                                 for task in tasks for which in ['profile', 'imp']],
                title=f"{p.name}, LTR: {ltr_name}, any LTR frac: {ltr_frac:.2f}",
                # BUGFIX: was `ylab=ylab` but `ylab` is never defined anywhere
                # in this file (NameError); use the plot_tracks default.
                ylab=True)
    sns.despine(top=True, right=True, left=False, bottom=True)
    out_name = shorten_pname(p.name) + "." + ltr_name
    plt.savefig(f"{fdir_individual}/{out_name}.contrib-w-profiles.pdf")
from basepair.plot.heatmaps import heatmap_sequence
from genomelake.extractors import FastaExtractor
from basepair.preproc import resize_interval, interval_center
from basepair.exp.te.dist import sort_seqs_kimura, consensus_dist_kimura
dfi_cols = ['chrom', 'start', 'end', 'interval_from_task', 'seqname', 'strand']
# fe = FastaExtractor(fasta_file, use_strand=True)
# for p in long_patterns:
# dfi = p.attrs['stacked_seqlet_imp'].dfi[dfi_cols]
# seqs = fe([resize_interval(x, 200) for x in BedTool.from_dataframe(dfi)])
# fig = heatmap_sequence(sort_seqs_kimura(seqs), aspect=0.5, figsize_tmpl=get_figsize(.5, 20), cbar=False)
# # Add the title
# ltr_name = dft.loc[p.name].repeat_name
# ltr_frac = dft.loc[p.name].LTR_overlap_frac
# plt.title(f"{p.name}, \nLTR: {ltr_name}, any LTR frac: {ltr_frac:.2f}");
# out_name = shorten_pname(p.name) + "." + ltr_name
# plt.savefig(f"{fdir_individual}/{out_name}.seq.pdf")
# plt.savefig(f"{fdir_individual}/{out_name}.seq.png")
ranges = isf.get_ranges()
ranges.columns = ['example_' + c for c in ranges.columns]
dfi = p.attrs['stacked_seqlet_imp'].dfi
profile_width = 200
for p in long_patterns:
# Get LTR names
ltr_name = dft.loc[p.name].repeat_name
ltr_frac = dft.loc[p.name].LTR_overlap_frac
# Extract stacked seqlets again
dfi = p.attrs['stacked_seqlet_imp'].dfi.copy()
# adopt dfi to the right format
dfi = resize_interval(dfi, width=profile_width)
dfi['center'] = interval_center(dfi)
task = p.name.split("/")[0]
ranges_subset = ranges[ranges.example_interval_from_task == task]
ranges_subset['example_idx_new'] = np.arange(len(ranges_subset))
dfi = pd.merge(dfi, ranges_subset, left_on='seqname', right_on='example_idx_new')
assert np.all(dfi.example_chrom == dfi.chrom)
assert np.all(dfi.center > dfi.example_start)
assert np.all(dfi.center < dfi.example_end)
dfi['pattern_center'] = interval_center(dfi) - dfi['example_start']
dfi['pattern_start'] = dfi['start'] - dfi['example_start']
dfi['pattern_end'] = dfi['end'] - dfi['example_start']
simp = isf.extract_dfi(dfi, profile_width=profile_width)
# Re-sort the sequence + profiles according to the kimura distance
simp = simp[np.argsort(consensus_dist_kimura(simp.seq))]
# Plot sequence
fig = simp.plot("seq", aspect=0.5, figsize_tmpl=get_figsize(.5, 20), cbar=False)
plt.title(f"{p.name}, \nLTR: {ltr_name}, any LTR frac: {ltr_frac:.2f}");
out_name = shorten_pname(p.name) + "." + ltr_name
fig.savefig(f"{fdir_individual}/{out_name}.heatmap-seq.pdf", raster=True)
# Plot profile
fig = simp.plot("profile", figsize=get_figsize(2, 1))
fig.savefig(f"{fdir_individual}/{out_name}.heatmap-profile.pdf", raster=True)
# Save the color-bar
from concise.preprocessing import encodeDNA
# Dummy all-A one-hot matrix, rendered only to export a standalone colorbar.
fig = heatmap_sequence(encodeDNA(["A"*100]*200), aspect=1, figsize_tmpl=get_figsize(1, 5), cbar=True);
fig.savefig(f"{fdir}/colorbar.pdf")
# Setup for the final summary-table figure
pl = {p.name: p for p in patterns_long_name}
df_info = get_df_info(long_patterns_clustered, tasks)
p = long_patterns[0]
p.attrs['n_seqlets']
long_patterns[0].name
# fp_slice = slice(25, 175)
# Central footprint window shown in the profile columns
fp_slice = slice(10, 190)
# Per-task maximal footprint value (shared y-range for the profile panels)
max_vals = {t: df_info.max()[t + "_max"] for t in tasks}
fig, axes = plt.subplots(len(long_patterns_clustered), 8, figsize=get_figsize(1, aspect=1))
# fig, axes = plt.subplots(2, 7, figsize=get_figsize(2, aspect=1/2))
fig.subplots_adjust(hspace=0, wspace=0)
# Get the ylim for each TF (include 0 so the range always covers the origin)
contrib_ylim = {tf: (min([p.contrib[p.attrs['TF']].min() for p in long_patterns_clustered if p.attrs['TF'] == tf] + [0]),
                     max([p.contrib[p.attrs['TF']].max() for p in long_patterns_clustered if p.attrs['TF'] == tf] + [0]))
                for tf in tasks}
# Final summary-table figure: one row per clustered long pattern with the
# seq-IC logo (col 0), blank spacer columns, LTR annotation (col 3) and
# per-task stranded footprint profiles (cols 4+).
for i, p in enumerate(long_patterns_clustered):
    # p = pl[pname]
    # if p.name in main_patterns:
    #     continue
    # Motif logo
    ltr_name = dft.loc[p.name].repeat_name
    ltr_frac = dft.loc[p.name].LTR_overlap_frac
    ax = axes[i, 0]
    # Text columns before
    ax.text(-1, 0, p.attrs["TF"] + "\n" + str(p.attrs["n_seqlets"]), fontsize=8, horizontalalignment='right')
    seqlogo(p.get_seq_ic(), ax=ax)
    ax.set_ylim([0, 2])  # all plots have the same shape
    strip_axis(ax)
    ax.axison = False
    pos1 = ax.get_position()  # get the original position
    # widen the logo column (3x) and vertically center it within the row
    pos2 = [pos1.x0, pos1.y0 + pos1.height * 0.4, pos1.width *3, pos1.height * .5]
    ax.set_position(pos2)  # set a new position
    if i == 0:
        ax.set_title("Sequence\ninfo. content")
    # TODO: columns 1-3 are blank placeholders
    ax = axes[i, 1]
    blank_ax(ax)
    ax = axes[i, 2]
    blank_ax(ax)
    ax = axes[i, 3]
    blank_ax(ax)
    ax.text(1, 0, ltr_name + f"\n{ltr_frac:.2f}", fontsize=8, horizontalalignment='right')
    # Profile columns
    for j, task in enumerate(tasks):
        ax = axes[i, 4 + j]
        fp = p.profile[task]
        ax.plot(fp[fp_slice, 0], color=tf_colors[task])
        ax.plot(-fp[fp_slice, 1], color=tf_colors[task], alpha=0.5)  # negative strand
        ax.set_ylim([-max_vals[task], max_vals[task]])  # all plots have the same shape
        strip_axis(ax)
        ax.axison = False
        if i == 0:
            ax.set_title(task)
# NOTE(review): this overwrites the earlier figure saved to the same path
# (pattern-table.all.pdf) -- consider using a distinct filename.
fig.savefig(f'{fdir}/pattern-table.all.pdf')