# Interactive inspection: keys of the attribute mapping of the current pattern `p`.
p.attrs.keys()
dft

# Map every repeat name to the list of pattern names that fall into that group.
patterns_by_repeat = {
    repeat_name: list(group['pattern_name'])
    for repeat_name, group in dft.reset_index().groupby("repeat_name")
}
patterns_by_repeat["MMERVK9C_I-int"]
# NOTE(review): a dangling `"MMERVK9C_I-int":` dict-key fragment stood here.
# It is not a valid statement on its own (SyntaxError), so it is kept only as
# this comment.
# Plot the sequence information content of two windows of the clustered
# long patterns: first the [-8:-6] window, then the last three entries.
for tail_window in (slice(-8, -6), slice(-3, None)):
    for p in long_patterns_clustered[tail_window]:
        p.plot("seq_ic")
# Interactive inspection of one pattern's stacked seqlet importance table
# and of the `+` implementation on StackedSeqletImp.
long_patterns_by_name[pname].attrs['stacked_seqlet_imp'].dfi
StackedSeqletImp.__add__

# Concatenate the stacked seqlet importances of every pattern listed under
# the MMERVK9C_I-int repeat.
stacked_seqlets = StackedSeqletImp.concat([
    patterns_by_name[name].attrs['stacked_seqlet_imp']
    for name in patterns_by_repeat["MMERVK9C_I-int"]
])
# Pattern objects listed under the MMERVK9C_I-int repeat.
pgroup = [patterns_by_name[pname] for pname in patterns_by_repeat["MMERVK9C_I-int"]]
p1 = pgroup[0]
p2 = pgroup[1]
# NOTE(review): p3 is the same element as p2 (index 1) — possibly meant
# pgroup[2]; confirm before relying on p3.
p3 = pgroup[1]

# Start the list with the reference pattern's seqlet table. `.dfi` has to be
# read from the stacked-seqlet object itself; calling it on the enclosing
# list (as before) raises AttributeError.
dfi_list = [p1.attrs['stacked_seqlet_imp'].dfi]
for po in pgroup[1:]:
    dfi = po.attrs['stacked_seqlet_imp'].dfi
    # Align the other pattern to the reference pattern p1.
    poa = po.align(p1)
    # Encode the strand column as +1 / -1.
    strand_vec = dfi.strand.map({"+": 1, "-": -1})
    # NOTE(review): nothing is appended to dfi_list yet — the loop only
    # computes the intermediate values.
# Interactive inspection of the pairwise alignments between p1, p2 and p3.
p1.attrs
p1a = p1.align(p2)
p2a = p2.align(p1)
p2a
p3a = p3.align(p1)
# NOTE: a positive offset means that we have to subtract it from the intervals
# (if on the + strand after flipping it):
#   start - offset * (1 if strand == "+" else -1)
#   end - offset * (1 if strand == "+" else -1)
p3a.attrs['align']
p2a.attrs['align']

# p2 (both orientations) next to the reference p1 and the aligned p2.
p2.plot('seq_ic')
p2.rc().plot('seq_ic')
p1.plot('seq_ic')
p2a.plot('seq_ic');

# p1 (both orientations) next to p2 and the aligned p1.
p1.plot('seq_ic')
p1.rc().plot('seq_ic')
p2.plot('seq_ic')
p1a.plot('seq_ic');
# The lookup previously used an empty key (`attrs['']`); 'align' is the key
# read from the other aligned patterns above.
p1a.attrs['align']
# TODO - pairwise align these locations...

# Draw one table row per main pattern: sequence logo, repeat annotation and
# the per-task profiles, then save the figure as a PDF.

# Window of profile positions that gets plotted.
# fp_slice = slice(25, 175)
fp_slice = slice(10, 190)

# Symmetric y-limit per task, read from the "<task>_max" columns of df_info.
# The column-wise maxima are computed once instead of once per task.
df_info_max = df_info.max()
max_vals = {t: df_info_max[t + "_max"] for t in tasks}

# squeeze=False keeps `axes` two-dimensional even when there is a single
# main pattern, so the `axes[i, col]` indexing below always works.
fig, axes = plt.subplots(len(main_patterns), 8,
                         figsize=get_figsize(1, aspect=1/2),
                         squeeze=False)
# fig, axes = plt.subplots(2, 7, figsize=get_figsize(2, aspect=1/2))
fig.subplots_adjust(hspace=0, wspace=0)

# Get the ylim for each TF: (min, max) contribution over all clustered long
# patterns of that TF; the appended 0 keeps the range spanning zero.
# NOTE(review): contrib_ylim is not used further down in this section.
contrib_ylim = {
    tf: (
        min([p.contrib[p.attrs['TF']].min()
             for p in long_patterns_clustered if p.attrs['TF'] == tf] + [0]),
        max([p.contrib[p.attrs['TF']].max()
             for p in long_patterns_clustered if p.attrs['TF'] == tf] + [0]),
    )
    for tf in tasks
}

for i, pname in enumerate(main_patterns):
    p = pl[pname]
    # if p.name in main_patterns:
    #     continue

    # Repeat annotation of this pattern (single row lookup in dft).
    pattern_row = dft.loc[p.name]
    ltr_name = pattern_row.repeat_name
    ltr_frac = pattern_row.LTR_overlap_frac

    # Motif logo
    ax = axes[i, 0]
    # Text columns before
    ax.text(-1, 0, p.attrs["TF"] + "\n" + str(p.attrs["n_seqlets"]),
            fontsize=8, horizontalalignment='right')
    seqlogo(p.get_seq_ic(), ax=ax)
    ax.set_ylim([0, 2])  # all plots have the same shape
    strip_axis(ax)
    ax.axison = False
    # Stretch the logo axis to three column widths and shrink it vertically.
    pos1 = ax.get_position()  # get the original position
    pos2 = [pos1.x0, pos1.y0 + pos1.height * 0.4, pos1.width * 3, pos1.height * .5]
    ax.set_position(pos2)  # set a new position
    if i == 0:
        ax.set_title("Sequence\ninfo. content")

    # TODO
    # Columns 1-3 are blanked; column 3 additionally carries the repeat
    # name and the LTR overlap fraction.
    for blank_col in (1, 2, 3):
        blank_ax(axes[i, blank_col])
    ax = axes[i, 3]
    ax.text(1, 0, ltr_name + f"\n{ltr_frac:.2f}", fontsize=8, horizontalalignment='right')

    # Profile columns
    for j, task in enumerate(tasks):
        ax = axes[i, 4 + j]
        fp = p.profile[task]
        ax.plot(fp[fp_slice, 0], color=tf_colors[task])
        ax.plot(-fp[fp_slice, 1], color=tf_colors[task], alpha=0.5)  # negative strand
        ax.set_ylim([-max_vals[task], max_vals[task]])  # all plots have the same shape
        strip_axis(ax)
        ax.axison = False
        if i == 0:
            ax.set_title(task)

fig.savefig(f'{fdir}/pattern-table.main.pdf')
# Sum of n_pattern per repeat name, printed in full.
print(dft.groupby('repeat_name')['n_pattern'].sum().to_string())

# Second element of the first mr_dict entry, used for interactive inspection.
mr = mr_dict[0][1]
dfs = mr.seqlet_df_instances()
mr.get_seqlet_intervals('metacluster_0/pattern_0', as_df=True)
def prefix_column(df, column, prefix):
    """Prepend `prefix` to every value of `df[column]`.

    The column is overwritten on the frame that was passed in, and that same
    frame is returned so the call can be chained (e.g. via `DataFrame.pipe`).

    Args:
        df: frame holding the column to rewrite (modified in place).
        column: name of the column to rewrite.
        prefix: value placed in front of each entry (`prefix + value`).

    Returns:
        The same `df` object, with the rewritten column.
    """
    prefixed = prefix + df[column]
    df[column] = prefixed
    return df
mr.get_seqlet_intervals("metacluster_0/pattern_0", as_df=True)

# Seqlet intervals of every pattern of every (t, mr) pair in mr_dict, stacked
# into one frame. Each frame's 'pattern' column gets "<t>/" prepended.
dfs = pd.concat([
    prefix_column(mr.get_seqlet_intervals(pattern, as_df=True), 'pattern', t + "/")
    for t, mr in mr_dict
    for pattern in mr.patterns()
])
dft.head()

# Index the intervals by pattern name (copied from the 'pattern' column) so
# they can be joined onto dft.
dfs = dfs.assign(pattern_name=dfs['pattern']).set_index('pattern_name')

# Unique seqlet locations across different positions
dict(
    dft.join(dfs)
    .groupby('repeat_name')
    .apply(lambda grp: len(grp[['seqname', 'start', 'end']].drop_duplicates()))
)
list(main_patterns)

# main patterns: per repeat name, the index label with the largest
# LTR_overlap_frac.
dict(dft.groupby("repeat_name")["LTR_overlap_frac"].idxmax())

# NOTE(review): this writes whatever `fig` currently refers to under the
# ".all.pdf" name — confirm it is the intended figure.
fig.savefig(f'{fdir}/pattern-table.all.pdf')