# Experiment identifier (comma-separated hyperparameter string) and the
# importance-score track used throughout this notebook/script.
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
imp_score = 'profile/wn'
# Three things to show for the long motifs
# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
# Deduplicated: this import was repeated twice.
from basepair.plot.profiles import multiple_plot_stranded_profile
from basepair.exp.paper.config import *
from basepair.plot.table import render_mpl_table
from basepair.exp.paper.fig4 import *
from basepair.plot.tracks import tidy_motif_plot
paper_config()
# Output directories for figures: <ddir>/figures/modisco/<exp>/long-motifs
# plus a subfolder for the per-pattern plots.
fdir = Path(f"{ddir}/figures/modisco/{exp}/long-motifs")
# parents=True so the first run doesn't fail when intermediate dirs are missing
fdir.mkdir(parents=True, exist_ok=True)
fdir_individual = fdir / 'individual'
fdir_individual.mkdir(exist_ok=True)
# Locate the trained model directory and load, per task:
#   - ModiscoResult HDF5 handles (opened for random access below)
#   - discovered patterns (pickled)
#   - footprint profiles (pickled)
model_dir = models_dir / exp
mr_dict = OrderedDict([(task, ModiscoResult(model_dir / f"deeplift/{task}/out/{imp_score}/modisco.h5"))
                       for task in tasks])
for k, v in mr_dict.items(): v.open()
# NOTE(review): despite the name, this is a list of (task, patterns) tuples,
# not a dict.
patterns_dict = [(task, read_pkl(model_dir / f"deeplift/{task}/out/{imp_score}/patterns.pkl"))
                 for task in tasks]
footprints = {t: read_pkl(model_dir / f"deeplift/{t}/out/{imp_score}/footprints.pkl")
              for t in tasks}
# Load the importance score files
isf = ImpScoreFile(model_dir / 'deeplift.imp_score.h5', default_imp_score=imp_score)
isf.cache()  # load all the regions
unclustered_patterns_dict = [(task, read_pkl(model_dir / f"deeplift/{task}/out/{imp_score}/unclustered.patterns.pkl"))
                             for task in tasks]
def get_seqlets_bedtool(p):
    """Return a BedTool over the seqlet intervals of pattern `p`.

    The per-task BED file is located via the pattern's 'TF' attribute and the
    module-level globals `model_dir` / `imp_score`.
    """
    task = p.attrs['TF']
    return BedTool(str(model_dir / f"deeplift/{task}/out/{imp_score}/seqlets/{p.name}.bed.gz"))
# re-order the seqlets
# Re-aggregate the unclustered patterns, attaching the seqlet intervals,
# the originating task (TF) and the seqlet count as attributes so each
# aggregated pattern is self-contained.
out = []
for task, ipatterns in unclustered_patterns_dict:
    mr = mr_dict[task]
    for p in ipatterns:
        n_seqlets = p.attrs['n_seqlets']
        sti = p.attrs['stacked_seqlet_imp']
        # attach the genomic intervals of the seqlets
        sti.dfi = mr.get_seqlet_intervals(p.name, as_df=True)
        o = (sti.aggregate().
             add_attr('TF', task).
             add_attr("n_seqlets", n_seqlets).
             add_attr("stacked_seqlet_imp", sti).
             add_attr("features", {"n seqlets": n_seqlets}))
        o.name = p.name
        out.append(o)
# Keep only well-supported patterns (>100 seqlets)
patterns2 = [p for p in out if p.attrs['n_seqlets'] > 100]
patterns_long_name = [p.copy() for p in patterns2]
# update the names to be globally unique: "<TF>/<pattern name>"
for p in patterns_long_name:
    p.name = p.attrs['TF'] + '/' + p.name
patterns_by_name = {p.name: p for p in patterns_long_name}
# Patterns with non-trivial total sequence information content
patterns = [p for p in patterns_long_name if p.seq_info_content > 4]
len(patterns)
p = patterns[0]
p = patterns[0]
from basepair.exp.paper.repeat_masker import download_repeat_masker, read_repeat_masker, intersect_repeat_masker
from pybedtools import BedTool
dfrm = read_repeat_masker(download_repeat_masker())
from basepair.modisco.core import patterns_to_df
bt_te = BedTool.from_dataframe(dfrm)
dfi_raw = pd.concat(Parallel(n_jobs=20)(delayed(intersect_repeat_masker)(p.attrs['TF'] + "/" + p.name, get_seqlets_bedtool(p), bt_te, f=0.9) for p in tqdm(patterns2)))
dfi = dfi_raw.copy()
# Restrict only to LTR/
# dfi = dfi[dfi.repeat_family.str.startswith("LTR/")]
# Append some stats
unique_elements = dfi.groupby(['pattern_name']).interval.nunique()
dfiu = dfi[['pattern_name', 'n_pattern']].drop_duplicates().set_index('pattern_name').join(unique_elements)
dfiu['LTR_overlap_frac'] = dfiu.interval / dfiu.n_pattern
dfi = pd.merge(dfi, dfiu.reset_index()[['pattern_name', 'LTR_overlap_frac']], on='pattern_name', how='left')
dfi = dfi.drop_duplicates()
# Append properties
df = patterns_to_df(patterns_long_name, properties=['name', 'short_name', 'seq_info_content'])
df = df.set_index('name').join(dfiu).reset_index()
# Thresholds used to call a pattern a transposable element (TE)
TE_min_seq_IC = 30
TE_min_LTR_Overlap_frac = 0.7
paper_config()
height = get_figsize(0.25)[0]
# Joint scatter: LTR overlap fraction vs total sequence information content
fig = sns.jointplot("LTR_overlap_frac", "seq_info_content", marginal_kws=dict(bins=15), s=5, data=df, height=height);
fig.ax_joint.axvline(TE_min_LTR_Overlap_frac, linestyle='--', color='grey', alpha=.4);
# fig.ax_joint.axhline(TE_min_seq_IC, linestyle='--', color='grey', alpha=.4);
fig.savefig(f"{fdir}/LTR_overlap_frac,seq-IC.scatter.pdf")
# Split motifs into short/long groups by sequence IC (same cutoff as
# TE_min_seq_IC, hard-coded here)
df['group'] = (df['seq_info_content'] > 30).map({False: 'short', True: 'long'})
plotnine.options.figure_size = get_figsize(0.25)
fig = (ggplot(aes(x='LTR_overlap_frac', color='group'), data=df) +
       geom_density() +
       scale_color_brewer(type='qual', palette=3) +
       xlab("LTR overlap fraction") +
       theme_classic(base_size=10, base_family='Arial')
       )
fig
fig.save(f"{fdir}/LTR_overlap_frac.density.pdf")
# TE's are in the top right
# pattern_te_names = df[(df.seq_info_content > TE_min_seq_IC) & (df.LTR_overlap_frac > TE_min_LTR_Overlap_frac)].sort_values('LTR_overlap_frac', ascending=False).name.unique()
# TE's are in the top right
# NOTE(review): only the IC threshold is applied here; the overlap-fraction
# filter above was commented out.
pattern_te_names = df[(df.seq_info_content > TE_min_seq_IC)].sort_values('LTR_overlap_frac', ascending=False).name.unique()
len(pattern_te_names)
# We can see that motifs with high IC frequently fall within regions of known TEs
# Most common repeat names / families among the overlaps (notebook output)
dfi.repeat_name.value_counts()
dfi.repeat_family.value_counts()
# Load the per-task pattern summary tables and derive contribution statistics
dfp = pd.concat([pd.read_csv(f"{model_dir}/deeplift/{task}/out/{imp_score}/pattern_table.csv").assign(task=task)
                 for task in tasks])
dfp['pattern_name'] = dfp['task'] + '/' + dfp['pattern']
# Max / sum of the per-task importance-profile contribution across all tasks
dfp['max_contrib'] = dfp[[f'{task} imp profile' for task in tasks]].max(axis=1)
dfp['sum_contrib'] = dfp[[f'{task} imp profile' for task in tasks]].sum(axis=1)
# Contribution within the pattern's own task
dfp['contrib'] = [row[f'{row.task} imp profile'] for i, row in dfp.iterrows()]
dfp['length'] = dfp.consensus.str.len()
dfp['total_contrib'] = dfp['contrib'] * dfp['length']
# Total information content = motif length * mean per-position PWM IC
dfp['IC'] = dfp['length'] * dfp['ic pwm mean']
dfp['total_sum_contrib'] = dfp[[f'{task} imp profile' for task in tasks]].sum(axis=1) * dfp['length']
# Main motifs
dfp = dfp[dfp['IC'] > 4]
dfp = dfp[dfp['n seqlets'] > 100]
# dfp['max_contrib_count'] = dfp[[f'{task} imp profile' for task in tasks]].max(axis=1)
# dfp['contrib_count'] = [row[f'{row.task} imp profile'] for i, row in dfp.iterrows()]
dfp.head()
# Exploratory scatter plots (notebook cells, figures rendered inline):
# mean PWM IC / total IC vs various contribution measures.
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='ic pwm mean', y='max_contrib', color='IC'), dfp) +
 geom_point() +
 theme_classic()
 )
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='ic pwm mean', y='max_contrib', color='length'), dfp) +
 geom_point() +
 theme_classic()
 )
# NOTE(review): this cell duplicates the previous one exactly.
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='ic pwm mean', y='max_contrib', color='length'), dfp) +
 geom_point() +
 theme_classic()
 )
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='ic pwm mean', y='contrib', color='length'), dfp) +
 geom_point() +
 theme_classic()
 )
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='IC', y='max_contrib', color='length'), dfp) +
 geom_point() +
 theme_classic()
 )
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='IC', y='length', color='length'), dfp) +
 geom_point() +
 theme_classic()
 )
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='IC', y='contrib', color='length'), dfp) +
 geom_point() +
 theme_classic()
 )
from adjustText import adjust_text
# Labeled scatter plots; adjust_text de-overlaps the pattern-name labels
# after drawing the ggplot onto a matplotlib figure.
plotnine.options.figure_size = get_figsize(1.5)
f = (ggplot(aes(x='IC', y='contrib', color='length', label='pattern_name'), dfp) +
     geom_point() +
     geom_text(size=7) +
     theme_classic()
     )
f = f.draw()
for a in f.axes:
    texts = [t for t in a.texts]
    adjust_text(texts, ax=a, arrowprops=dict(arrowstyle='-', color='grey'))
plotnine.options.figure_size = get_figsize(1.5)
f = (ggplot(aes(x='IC', y='sum_contrib', color='length', label='pattern_name'), dfp) +
     geom_point() +
     geom_text(size=7) +
     theme_classic()
     )
f = f.draw()
for a in f.axes:
    texts = [t for t in a.texts]
    adjust_text(texts, ax=a, arrowprops=dict(arrowstyle='-', color='grey'))
plotnine.options.figure_size = get_figsize(1.5)
f = (ggplot(aes(x='ic pwm mean', y='sum_contrib', color='length', label='pattern_name'), dfp) +
     geom_point() +
     geom_text(size=7) +
     theme_classic() +
     ylab("Average contrib")
     )
f = f.draw()
for a in f.axes:
    texts = [t for t in a.texts]
    adjust_text(texts, ax=a, arrowprops=dict(arrowstyle='-', color='grey'))
plotnine.options.figure_size = get_figsize(1.5)
f = (ggplot(aes(x='IC', y='total_sum_contrib', color='length', label='pattern_name'), dfp) +
     geom_point() +
     geom_text(size=7) +
     theme_classic()
     )
f = f.draw()
for a in f.axes:
    texts = [t for t in a.texts]
    adjust_text(texts, ax=a, arrowprops=dict(arrowstyle='-', color='grey'))
plotnine.options.figure_size = get_figsize(1.5)
f = (ggplot(aes(x='IC', y='total_contrib', color='length', label='pattern_name'), dfp) +
     geom_point() +
     geom_text(size=7) +
     theme_classic()
     )
f = f.draw()
for a in f.axes:
    texts = [t for t in a.texts]
    adjust_text(texts, ax=a, arrowprops=dict(arrowstyle='-', color='grey'))
import hvplot.pandas
# Interactive scatter (hvplot) for hover-inspecting individual patterns
dfp.hvplot(x='IC', y='contrib', kind='scatter', value_label='pattern_name', alpha=0.7, persist=False)
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='IC', y='total_contrib', color='length'), dfp) +
 geom_point() +
 theme_classic()
 )
from scipy.stats import spearmanr
# Per-pattern summary computed directly from the pattern objects: total
# contribution within the pattern's own task, IC, rank correlation between
# sequence IC and contribution, and (log) seqlet count. `is_te` uses the
# same IC > 30 cutoff as above.
dfp2 = pd.DataFrame([{"contrib": p.contrib[p.attrs['TF']].sum(),
                      "IC": p.seq_info_content,
                      "name": p.name,
                      "corr": spearmanr(p._get_track('seq_ic').ravel(), p.contrib[p.attrs['TF']].ravel())[0],
                      "log_n_seqlets": np.log10(p.attrs['n_seqlets']),
                      'is_te': p.seq_info_content > 30}
                     for p in patterns])
dfp2['contrib-per-IC'] = dfp2['contrib'] / dfp2['IC']
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='IC', y='contrib', color='is_te'), dfp2) +
 geom_point() +
 theme_classic()
 )
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='corr', y='contrib-per-IC', color='is_te'), dfp2) +
 geom_point() +
 theme_classic()
 )
# NOTE(review): this cell duplicates the IC-vs-contrib plot above.
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='IC', y='contrib', color='is_te'), dfp2) +
 geom_point() +
 theme_classic()
 )
plotnine.options.figure_size = get_figsize(0.4)
(ggplot(aes(x='IC', y='contrib', color='log_n_seqlets'), dfp2) +
 geom_point() +
 theme_classic()
 )
dfp.head()
# For each pattern, find the single most frequently overlapped repeat element
dfi_top = dfi.drop_duplicates()
# Count of seqlets per (pattern, repeat name) and its fraction of the pattern
dfi_top['n_repeat_name'] = dfi_top.groupby(['pattern_name', 'repeat_name']).repeat_name.transform('size')
dfi_top['n_repeat_frac'] = dfi_top['n_repeat_name'] / dfi_top['n_pattern']
# Top repeat per pattern = row with the maximal repeat fraction
dfi_top1 = dfi_top.groupby(['pattern_name']).apply(lambda x: x.loc[x.n_repeat_frac.idxmax()])
# Drop per-seqlet coordinate columns; pattern_name stays as the index
del dfi_top1['chrom']
del dfi_top1['start']
del dfi_top1['end']
del dfi_top1['interval']
del dfi_top1['pattern_name']
len(dfi_top1)
# Long (high-IC) patterns, clustered/aligned for display
long_patterns = [p for p in patterns_long_name if p.seq_info_content >= 30]
long_patterns_by_name = {p.name: p for p in long_patterns}
long_patterns_clustered = cluster_align_patterns(long_patterns, n_clusters=len(patterns_long_name)//2, trials=20, max_shift=20)
# Manually curated CWM/PWM overlap table living next to the figure dir
df_cp = pd.read_excel(f"{fdir}/../overlap_cwm_pwm.xlsx")
# NOTE(review): this comparison is a no-op statement (leftover notebook
# cell output?) -- its result is discarded.
df_cp.is_te == 1
df_cp['pattern_name'] = df_cp['task'] + "/" + df_cp['pattern']
df_cp.set_index('pattern_name', inplace=True)
# Re-order the table to match the clustered pattern order
dfp = df_cp.loc[[p.name for p in long_patterns_clustered]]
print(dfp[['n_pwm_gw', 'n_pwm_peaks', 'n_cwm_peaks', 'n_cwm_and_pwm_peaks', 'n_seqlets']].to_string())
df_info = get_df_info(long_patterns_clustered, tasks)
df_info.head()
dft = dfi_top1.loc[[p.name for p in long_patterns_clustered]]
print(dft.to_string())
# clustered version of the plot
# One row per clustered long pattern: seq-IC logo plus text annotations
# (pattern name, seqlet count, CWM peak count, LTR name / overlap fraction).
# NOTE(review): the subplot grid is sized with len(long_patterns) but
# iterated with long_patterns_clustered -- confirm the two have equal length.
fig, axes = plt.subplots(len(long_patterns), 1, figsize=get_figsize(.5, aspect=1.5))
# gridspec_kw=dict(wspace=10))
# fig, axes = plt.subplots(2, 7, figsize=get_figsize(2, aspect=1/2))
# fig.subplots_adjust(hspace=0, wspace=2)
for i, p in enumerate(long_patterns_clustered):
    # p = pl[pname]
    # if p.name in main_patterns:
    #     continue
    # Motif logo
    ltr_name = dft.loc[p.name].repeat_name
    ltr_frac = dft.loc[p.name].LTR_overlap_frac
    ax = axes[i]
    # Text columns before
    seqlogo(p.get_seq_ic(), ax=ax)
    ax.set_ylim([0, 2])  # all plots have the same shape
    strip_axis(ax)
    ax.axison = False
    if i == 0:
        ax.set_title("Sequence information content")
    task, mc, pl = p.name.split("/")
    pname = task + "/" + pl.split('_')[1]
    # text in front of the motif (negative x = left of the logo)
    ax.text(-25, 0, pname, fontsize=8, horizontalalignment='right')
    ax.text(-15, 0, str(p.attrs['n_seqlets']), fontsize=8, horizontalalignment='right')
    ax.text(-5, 0, int(dfp.loc[p.name].n_cwm_peaks), fontsize=8, horizontalalignment='right')
    # text after the motif
    ax.text(75, 0, f"{ltr_frac:.2f}", fontsize=8, horizontalalignment='left')
    ax.text(85, 0, ltr_name, fontsize=8, horizontalalignment='left')
fig.savefig(f'{fdir}/pattern-table.all.pdf')
# Representative pattern per repeat_name = the one with max LTR overlap
main_patterns = dft.groupby("repeat_name").LTR_overlap_frac.idxmax().values
dft.reset_index().to_csv(f"{fdir}/long_patterns.tsv", index=False, sep='\t')
dfi_top1
dfi_top1.LTR_overlap_frac.plot.hist(30);
dfi_top1.reset_index().to_excel(f"{fdir}/../repeat-masker-overlap.xlsx", index=False)
dfi_top1
p = long_patterns[0]
p.name
def shorten_pname(name):
    """Shorten a full pattern name to "<TF>.p<idx>".

    Example: 'Oct4/metacluster_0/pattern_9' -> 'Oct4.p9'
    (the original docstring incorrectly said 'Oct4.p0').
    """
    tf, mc, pn = name.split("/")
    pn_id = pn.replace("pattern_", "")
    return f"{tf}.p{pn_id}"
# Plot the seq-IC + contribution tracks for each long pattern; one PDF per
# pattern named "<TF>.p<idx>.<repeat name>.contrib.pdf".
for p in long_patterns:
    # shared y-range across the pattern's contribution tracks
    ymax = max([v.max() for v in p.contrib.values()])
    ymin = min([v.min() for v in p.contrib.values()])
    ltr_name = dft.loc[p.name].repeat_name
    ltr_frac = dft.loc[p.name].LTR_overlap_frac
    # Plot the other factor
    long_patterns_by_name[p.name].plot(['seq_ic', 'contrib'],
                                       rotate_y=0,
                                       letter_width=0.1,
                                       height=0.4,
                                       ylim=[(0, 2)] + [(ymin, ymax)]*4,
                                       title=f"{p.name}, LTR: {ltr_name}, any LTR frac: {ltr_frac:.2f}");
    sns.despine(top=True, right=True, left=False, bottom=True)
    out_name = shorten_pname(p.name) + "." + ltr_name
    plt.savefig(f"{fdir_individual}/{out_name}.contrib.pdf")
import collections
from basepair.plot.tracks import *
def plot_track(arr, ax, legend=False, ylim=None, color=None):
    """Plot one track onto the matplotlib axis `ax`.

    Args:
        arr: numpy array; 1-D or (L, 1) -> single line plot,
            (L, 2) -> two stranded line plots (pos/neg),
            (L, 4) -> sequence logo.
        ax: matplotlib axis to draw on.
        legend: if True, add a legend (stranded tracks only).
        ylim: optional (ymin, ymax) applied to the axis.
        color: a single matplotlib color or a list/tuple of per-strand colors.

    Raises:
        ValueError: if `arr.shape[1]` is not 1, 2 or 4.
    """
    seqlen = len(arr)
    if arr.ndim == 1 or arr.shape[1] == 1:
        # single track: accept either one color or a per-strand list/tuple.
        # BUGFIX: the original tested isinstance(color, collections.Sequence),
        # which (a) was removed in Python 3.10 (AttributeError) and (b) matched
        # plain strings, truncating e.g. 'red' to 'r'.
        if color is not None and isinstance(color, (list, tuple)):
            color = color[0]
        ax.plot(np.arange(1, seqlen + 1), np.ravel(arr), color=color)
    elif arr.shape[1] == 4:
        # one-hot / IC matrix -> sequence logo
        seqlogo(arr, ax=ax)
    elif arr.shape[1] == 2:
        # plot both strands
        if color is not None:
            assert isinstance(color, (list, tuple))
            c1 = color[0]
            c2 = color[1]
        else:
            c1, c2 = None, None
        ax.plot(np.arange(1, seqlen + 1), arr[:, 0], label='pos', color=c1)
        ax.plot(np.arange(1, seqlen + 1), arr[:, 1], label='neg', color=c2)
        if legend:
            ax.legend()
    else:
        # BUGFIX: message previously read "shape[1] != {actual}" (backwards)
        raise ValueError(f"Don't know how to plot array with shape[1] == {arr.shape[1]}. Valid values are: 1, 2 or 4.")
    if ylim is not None:
        ax.set_ylim(ylim)
def plot_tracks(tracks, seqlets=None, title=None, rotate_y=90, legend=False, fig_width=20,
                fig_height_per_track=2, ylim=None, same_ylim=False, ylab=True, color=None):
    """Plot multiple stacked tracks sharing the x-axis.

    One-hot-encoded sequence is drawn as a logo; 1- or 2-dim tracks as line
    plots (see `plot_track`).

    Args:
        tracks: dictionary of numpy arrays with the same axis-0 length;
            None values are skipped.
        seqlets: optional list of seqlets to highlight on every track.
        title: optional title drawn on the top axis.
        rotate_y: rotation angle of the y-axis labels.
        legend: forwarded to `plot_track`.
        fig_width: figure width.
        fig_height_per_track: figure height per track.
        ylim: if not None, a single tuple or a list of tuples representing
            the ylim to use per track.
        same_ylim: if True, use (0, global max) for every track.
        ylab: if True, label each axis with its track name.
        color: a single color spec or a list with one entry per track.

    Returns:
        matplotlib.figure.Figure
    """
    # BUGFIX: `seqlets=[]` was a mutable default argument; use None sentinel.
    if seqlets is None:
        seqlets = []
    tracks = skip_nan_tracks(tracks)  # ignore None values
    fig, axes = plt.subplots(len(tracks), 1,
                             figsize=(fig_width, fig_height_per_track * len(tracks)),
                             sharex=True)
    if len(tracks) == 1:
        axes = [axes]  # plt.subplots returns a bare axis for n=1
    if same_ylim:
        ylim = (0, max([v.max() for k, v in get_items(tracks)]))
    for i, (ax, (track, arr)) in enumerate(zip(axes, get_items(tracks))):
        plot_track(arr, ax, legend, get_list_value(ylim, i),
                   color=get_list_value(color, i))
        for seqlet in seqlets:
            plot_seqlet(seqlet, ax, add_label=i == 0)
        # ax.set_ylabel(track)
        if ylab:
            ax.set_ylabel(track, rotation=rotate_y,
                          multialignment='center',
                          ha='right', labelpad=5)
        simple_yaxis_format(ax)
        if i != len(tracks) - 1:
            # hide x tick marks on all but the bottom axis
            ax.xaxis.set_ticks_position('none')
        if i == 0 and title is not None:
            ax.set_title(title)
    # add ticks to the final (bottom) axis
    ax.xaxis.set_major_locator(ticker.AutoLocator())
    # ax.xaxis.set_major_locator(ticker.MaxNLocator(4))
    # spaced_xticks(ax, spacing=5)
    fig.subplots_adjust(hspace=0)  # stack panels flush against each other
    # fig.subplots_adjust(left=0, right=1, top=1, bottom=0)
    return fig
def get_tracks(p, tasks):
    """Assemble the list of plotting tracks for pattern `p`.

    Returns a list of (name, array) pairs: the sequence-IC track first, then,
    for every task, its profile and contribution tracks with the second
    (negative-strand) column negated for mirrored plotting.
    """
    def _mirror_neg_strand(arr):
        # Negate column 1 of two-column (stranded) arrays; leave others as-is.
        mirrored = arr.copy()
        if mirrored.shape[1] == 2:
            mirrored[:, 1] = -mirrored[:, 1]
        return mirrored

    raw = {kind: p._get_track(kind) for kind in ['profile', 'seq_ic', 'contrib']}
    track_list = [('seq_ic', raw['seq_ic'])]
    for task in tasks:
        for kind in ['profile', 'contrib']:
            track_list.append((task, _mirror_neg_strand(raw[kind][task])))
    return track_list
# Shared plotting parameters for the per-pattern track plots. Profile
# y-limits are shared across patterns per task (excluding one outlier
# pattern) so the panels are visually comparable.
pmax_dict = {t: max([p.profile[t].max() for p in long_patterns_clustered
                     if p.name != 'Klf4/metacluster_0/pattern_18'])
             for t in tasks}
ylim_profiles = {t: (0, pmax_dict[t]) for t in p.tasks()}
ymax = max([v.max() for v in p.contrib.values()])
ymin = min([v.min() for v in p.contrib.values()])
ylim_profiles
# ylim=ylim_profiles + [(0, 2)] + [(ymin, ymax)]*4
letter_width=0.1
height=0.4
rotate_y=0
# One color entry per track: None for seq_ic, then (pos, neg) per task
# profile and None for the contribution track.
colors = [None]
for task in tasks:
    colors.append((tf_colors[task], tf_colors[task] + "80"))  # 80 add alpha=0.5
    colors.append(None)
# NOTE(review): pmax_dict / ylim_profiles are immediately recomputed here
# with a different excluded outlier, overwriting the values above.
pmax_dict = {t: max([p.profile[t].max() for p in long_patterns_clustered
                     if p.name != 'Oct4/metacluster_0/pattern_14'])
             for t in tasks}
ylim_profiles = [(0, pmax_dict[t]) for t in p.tasks()]
# Per-pattern figure: seq-IC logo + (profile, contribution) tracks per task,
# one PDF per pattern.
for p in long_patterns:
    ymax = max([v.max() for v in p.contrib.values()])
    ymin = min([v.min() for v in p.contrib.values()])
    ltr_name = dft.loc[p.name].repeat_name
    ltr_frac = dft.loc[p.name].LTR_overlap_frac
    pymax = max([v.max() for v in p.profile.values()])
    pymin = min([v.min() for v in p.profile.values()])
    ylim_profiles = [(0, pmax_dict[t]) for t in p.tasks()]
    tracks = get_tracks(long_patterns_by_name[p.name], tasks)
    w, h = get_figsize(0.6, 1)
    plot_tracks(tracks,
                rotate_y=rotate_y,
                fig_width=w,
                fig_height_per_track=h / 9,
                color=colors,
                # profile tracks get symmetric per-task limits; contribution
                # tracks share the pattern-wide (ymin, ymax)
                ylim=[(0, 2)] + [(-pmax_dict[task], pmax_dict[task]) if which == 'profile' else (ymin, ymax)
                                 for task in tasks for which in ['profile', 'imp']],
                title=f"{p.name}, LTR: {ltr_name}, any LTR frac: {ltr_frac:.2f}",
                # BUGFIX: was `ylab=ylab` but `ylab` is never defined anywhere
                # in this file (NameError); use the plot_tracks default.
                ylab=True)
    sns.despine(top=True, right=True, left=False, bottom=True)
    out_name = shorten_pname(p.name) + "." + ltr_name
    plt.savefig(f"{fdir_individual}/{out_name}.contrib-w-profiles.pdf")
from basepair.plot.heatmaps import heatmap_sequence
from genomelake.extractors import FastaExtractor
from basepair.preproc import resize_interval, interval_center
from basepair.exp.te.dist import sort_seqs_kimura, consensus_dist_kimura
dfi_cols = ['chrom', 'start', 'end', 'interval_from_task', 'seqname', 'strand']
# fe = FastaExtractor(fasta_file, use_strand=True)
# for p in long_patterns:
# dfi = p.attrs['stacked_seqlet_imp'].dfi[dfi_cols]
# seqs = fe([resize_interval(x, 200) for x in BedTool.from_dataframe(dfi)])
# fig = heatmap_sequence(sort_seqs_kimura(seqs), aspect=0.5, figsize_tmpl=get_figsize(.5, 20), cbar=False)
# # Add the title
# ltr_name = dft.loc[p.name].repeat_name
# ltr_frac = dft.loc[p.name].LTR_overlap_frac
# plt.title(f"{p.name}, \nLTR: {ltr_name}, any LTR frac: {ltr_frac:.2f}");
# out_name = shorten_pname(p.name) + "." + ltr_name
# plt.savefig(f"{fdir_individual}/{out_name}.seq.pdf")
# plt.savefig(f"{fdir_individual}/{out_name}.seq.png")
ranges = isf.get_ranges()
ranges.columns = ['example_' + c for c in ranges.columns]
dfi = p.attrs['stacked_seqlet_imp'].dfi
profile_width = 200
for p in long_patterns:
# Get LTR names
ltr_name = dft.loc[p.name].repeat_name
ltr_frac = dft.loc[p.name].LTR_overlap_frac
# Extract stacked seqlets again
dfi = p.attrs['stacked_seqlet_imp'].dfi.copy()
# adopt dfi to the right format
dfi = resize_interval(dfi, width=profile_width)
dfi['center'] = interval_center(dfi)
task = p.name.split("/")[0]
ranges_subset = ranges[ranges.example_interval_from_task == task]
ranges_subset['example_idx_new'] = np.arange(len(ranges_subset))
dfi = pd.merge(dfi, ranges_subset, left_on='seqname', right_on='example_idx_new')
assert np.all(dfi.example_chrom == dfi.chrom)
assert np.all(dfi.center > dfi.example_start)
assert np.all(dfi.center < dfi.example_end)
dfi['pattern_center'] = interval_center(dfi) - dfi['example_start']
dfi['pattern_start'] = dfi['start'] - dfi['example_start']
dfi['pattern_end'] = dfi['end'] - dfi['example_start']
simp = isf.extract_dfi(dfi, profile_width=profile_width)
# Re-sort the sequence + profiles according to the kimura distance
simp = simp[np.argsort(consensus_dist_kimura(simp.seq))]
# Plot sequence
fig = simp.plot("seq", aspect=0.5, figsize_tmpl=get_figsize(.5, 20), cbar=False)
plt.title(f"{p.name}, \nLTR: {ltr_name}, any LTR frac: {ltr_frac:.2f}");
out_name = shorten_pname(p.name) + "." + ltr_name
fig.savefig(f"{fdir_individual}/{out_name}.heatmap-seq.pdf", raster=True)
# Plot profile
fig = simp.plot("profile", figsize=get_figsize(2, 1))
fig.savefig(f"{fdir_individual}/{out_name}.heatmap-profile.pdf", raster=True)
# Save the color-bar
from concise.preprocessing import encodeDNA
# Dummy all-A one-hot matrix, rendered only to export a standalone colorbar.
fig = heatmap_sequence(encodeDNA(["A"*100]*200), aspect=1, figsize_tmpl=get_figsize(1, 5), cbar=True);
fig.savefig(f"{fdir}/colorbar.pdf")
# Setup for the final summary-table figure
pl = {p.name: p for p in patterns_long_name}
df_info = get_df_info(long_patterns_clustered, tasks)
p = long_patterns[0]
p.attrs['n_seqlets']
long_patterns[0].name
# fp_slice = slice(25, 175)
# Central footprint window shown in the profile columns
fp_slice = slice(10, 190)
# Per-task maximal footprint value (shared y-range for the profile panels)
max_vals = {t: df_info.max()[t + "_max"] for t in tasks}
fig, axes = plt.subplots(len(long_patterns_clustered), 8, figsize=get_figsize(1, aspect=1))
# fig, axes = plt.subplots(2, 7, figsize=get_figsize(2, aspect=1/2))
fig.subplots_adjust(hspace=0, wspace=0)
# Get the ylim for each TF (include 0 so the range always covers the origin)
contrib_ylim = {tf: (min([p.contrib[p.attrs['TF']].min() for p in long_patterns_clustered if p.attrs['TF'] == tf] + [0]),
                     max([p.contrib[p.attrs['TF']].max() for p in long_patterns_clustered if p.attrs['TF'] == tf] + [0]))
                for tf in tasks}
# Final summary-table figure: one row per clustered long pattern with the
# seq-IC logo (col 0), blank spacer columns, LTR annotation (col 3) and
# per-task stranded footprint profiles (cols 4+).
for i, p in enumerate(long_patterns_clustered):
    # p = pl[pname]
    # if p.name in main_patterns:
    #     continue
    # Motif logo
    ltr_name = dft.loc[p.name].repeat_name
    ltr_frac = dft.loc[p.name].LTR_overlap_frac
    ax = axes[i, 0]
    # Text columns before
    ax.text(-1, 0, p.attrs["TF"] + "\n" + str(p.attrs["n_seqlets"]), fontsize=8, horizontalalignment='right')
    seqlogo(p.get_seq_ic(), ax=ax)
    ax.set_ylim([0, 2])  # all plots have the same shape
    strip_axis(ax)
    ax.axison = False
    pos1 = ax.get_position()  # get the original position
    # widen the logo column (3x) and vertically center it within the row
    pos2 = [pos1.x0, pos1.y0 + pos1.height * 0.4, pos1.width *3, pos1.height * .5]
    ax.set_position(pos2)  # set a new position
    if i == 0:
        ax.set_title("Sequence\ninfo. content")
    # TODO: columns 1-3 are blank placeholders
    ax = axes[i, 1]
    blank_ax(ax)
    ax = axes[i, 2]
    blank_ax(ax)
    ax = axes[i, 3]
    blank_ax(ax)
    ax.text(1, 0, ltr_name + f"\n{ltr_frac:.2f}", fontsize=8, horizontalalignment='right')
    # Profile columns
    for j, task in enumerate(tasks):
        ax = axes[i, 4 + j]
        fp = p.profile[task]
        ax.plot(fp[fp_slice, 0], color=tf_colors[task])
        ax.plot(-fp[fp_slice, 1], color=tf_colors[task], alpha=0.5)  # negative strand
        ax.set_ylim([-max_vals[task], max_vals[task]])  # all plots have the same shape
        strip_axis(ax)
        ax.axison = False
        if i == 0:
            ax.set_title(task)
# NOTE(review): this overwrites the earlier figure saved to the same path
# (pattern-table.all.pdf) -- consider using a distinct filename.
fig.savefig(f'{fdir}/pattern-table.all.pdf')