p.attrs.keys()

dict_keys(['TF', 'n_seqlets', 'features', 'align', 'cluster'])

dft

patterns_by_repeat = {k: list(v['pattern_name']) for k,v in dft.reset_index().groupby("repeat_name")}

patterns_by_repeat["MMERVK9C_I-int"]

['Nanog/metacluster_0/pattern_9',
 'Sox2/metacluster_0/pattern_6',
 'Oct4/metacluster_0/pattern_4']

"MMERVK9C_I-int":

for p in long_patterns_clustered[-8:-6]:
    p.plot("seq_ic")

for p in long_patterns_clustered[-3:]:
    p.plot("seq_ic")

long_patterns_by_name[pname].attrs['stacked_seqlet_imp'].dfi

StackedSeqletImp.__add__

<function __main__.StackedSeqletImp.__add__(self, stacked_seqlets)>

stacked_seqlets = StackedSeqletImp.concat(
    [patterns_by_name[pname].attrs['stacked_seqlet_imp']
     for pname in patterns_by_repeat["MMERVK9C_I-int"]]
)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-85-54abd224e84e> in <module>
----> 1 stacked_seqlets = StackedSeqletImp.concat(
      2     [patterns_by_name[pname].attrs['stacked_seqlet_imp']
      3      for pname in patterns_by_repeat["MMERVK9C_I-int"]]
      4 )

NameError: name 'StackedSeqletImp' is not defined

pgroup = [patterns_by_name[pname] for pname in patterns_by_repeat["MMERVK9C_I-int"]]

p1 = pgroup[0]
p2 = pgroup[1]
p3 = pgroup[1]

p1 = pgroup[0]
dfi_list = [p1.attrs['stacked_seqlet_imp']].dfi
for po in pgroup[1:]:
    dfi = po.attrs['stacked_seqlet_imp'].dfi
    poa = po.align(p1)
    strand_vec = dfi.strand.map({"+": 1, "-": -1})

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-89-3ef7e29659cf> in <module>
      1 p1 = pgroup[0]
----> 2 dfi_list = [p1.attrs['stacked_seqlet_imp']].dfi
      3 for po in pgroup[1:]:
      4     dfi = po.attrs['stacked_seqlet_imp'].dfi
      5     poa = po.align(p1)

AttributeError: 'list' object has no attribute 'dfi'

p1.attrs

{'TF': 'Nanog',
 'n_seqlets': 186,
 'stacked_seqlet_imp': <basepair.modisco.core.StackedSeqletImp at 0x7fb75e7bc630>,
 'features': {'n seqlets': 186}}

p1a = p1.align(p2)

p2a = p2.align(p1)
p2a

p3a = p3.align(p1)

# NOTE -> offset + means that we have to subtract form the intervals (if on the + strand after flipping it):
# start - offset * (1 if strand == "+" else -1)
# end - offset * (1 if strand == "+" else -1)

p3a.attrs['align']

{'use_rc': True, 'offset': 2}

p2a.attrs['align']

{'use_rc': True, 'offset': 2}

p2.plot('seq_ic')
p2.rc().plot('seq_ic')
p1.plot('seq_ic')
p2a.plot('seq_ic');

p1.plot('seq_ic')
p1.rc().plot('seq_ic')
p2.plot('seq_ic')
p1a.plot('seq_ic');

p1a.attrs['']

{'TF': 'Nanog',
 'n_seqlets': 186,
 'stacked_seqlet_imp': <basepair.modisco.core.StackedSeqletImp at 0x7fb75e316eb8>,
 'features': {'n seqlets': 186},
 'align': {'use_rc': True, 'offset': 2}}

# TODO - pairwise align these locations...

TODO¶

get seqlet locations for each
- store as dfi

# fp_slice = slice(25, 175)
fp_slice = slice(10, 190)

max_vals = {t: df_info.max()[t + "_max"] for t in tasks}

fig, axes = plt.subplots(len(main_patterns), 8, figsize=get_figsize(1, aspect=1/2))
# fig, axes = plt.subplots(2, 7, figsize=get_figsize(2, aspect=1/2))
fig.subplots_adjust(hspace=0, wspace=0)

# Get the ylim for each TF
contrib_ylim = {tf: (min([p.contrib[p.attrs['TF']].min() for p in long_patterns_clustered if p.attrs['TF'] == tf] + [0]),
                     max([p.contrib[p.attrs['TF']].max() for p in long_patterns_clustered if p.attrs['TF'] == tf] + [0]))
               for tf in tasks}

for i, pname in enumerate(main_patterns):
    p = pl[pname]
     
#     if p.name in main_patterns:
#         continue
    # Motif logo
    
    ltr_name = dft.loc[p.name].repeat_name
    ltr_frac = dft.loc[p.name].LTR_overlap_frac
    
    
    ax = axes[i, 0]
    # Text columns before
    ax.text(-1, 0, p.attrs["TF"] + "\n" + str(p.attrs["n_seqlets"]), fontsize=8, horizontalalignment='right')
    seqlogo(p.get_seq_ic(), ax=ax)
    ax.set_ylim([0, 2])  # all plots have the same shape
    strip_axis(ax)
    ax.axison = False
    pos1 = ax.get_position() # get the original position 
    pos2 = [pos1.x0, pos1.y0 + pos1.height * 0.4,  pos1.width *3, pos1.height * .5]
    ax.set_position(pos2) # set a new position
    if i == 0:
        ax.set_title("Sequence\ninfo. content")

    # TOOD 
    ax = axes[i, 1]
    blank_ax(ax)
    ax = axes[i, 2]
    blank_ax(ax)
    ax = axes[i, 3]
    blank_ax(ax)
    ax.text(1, 0, ltr_name + f"\n{ltr_frac:.2f}", fontsize=8, horizontalalignment='right')
    
    # Profile columns
    for j, task in enumerate(tasks):
        ax = axes[i, 4 + j]
        fp = p.profile[task]
        
        ax.plot(fp[fp_slice, 0], color=tf_colors[task])
        ax.plot(-fp[fp_slice, 1], color=tf_colors[task], alpha=0.5)  # negative strand
        
        ax.set_ylim([-max_vals[task], max_vals[task]])  # all plots have the same shape
        strip_axis(ax)
        ax.axison = False
        
        if i == 0:
            ax.set_title(task)

fig.savefig(f'{fdir}/pattern-table.main.pdf')

print(dft.groupby('repeat_name').n_pattern.sum().to_string())

repeat_name
(AGGGTT)n          106
BGLII              118
ID_B1              592
MMERVK9C_I-int     527
ORR1A2             670
ORR1C2             674
RLTR13B1           442
RLTR13D6           811
RLTR9E            1635
RMER10A            107

mr = mr_dict[0][1]

dfs = mr.seqlet_df_instances()

mr.get_seqlet_intervals('metacluster_0/pattern_0', as_df=True)

def prefix_column(df, column, prefix):
    df[column] = prefix + df[column]
    return df

mr.get_seqlet_intervals("metacluster_0/pattern_0", as_df=True)

dfs = pd.concat([mr.get_seqlet_intervals(pattern, as_df=True).pipe(prefix_column, 'pattern', t + "/")
                 for t,mr in mr_dict
                 for pattern in mr.patterns()])

dft.head()

dfs['pattern_name'] = dfs['pattern']
dfs = dfs.set_index('pattern_name')

# Unique seqlet locations across different positions

dict(dft.join(dfs).groupby('repeat_name').apply(lambda x: len(x[['seqname', 'start', 'end']].drop_duplicates())))

{'(AGGGTT)n': 106,
 'BGLII': 118,
 'ID_B1': 602,
 'MMERVK9C_I-int': 529,
 'ORR1A2': 672,
 'ORR1C2': 675,
 'RLTR13B1': 454,
 'RLTR13D6': 811,
 'RLTR9E': 1639,
 'RMER10A': 107}

list(main_patterns)

['Klf4/metacluster_0/pattern_18',
 'Nanog/metacluster_0/pattern_14',
 'Klf4/metacluster_0/pattern_5',
 'Sox2/metacluster_0/pattern_6',
 'Klf4/metacluster_0/pattern_12',
 'Sox2/metacluster_0/pattern_5',
 'Klf4/metacluster_0/pattern_16',
 'Nanog/metacluster_0/pattern_8',
 'Klf4/metacluster_0/pattern_9',
 'Sox2/metacluster_0/pattern_8']

# main patterns
dict(dft.groupby("repeat_name").LTR_overlap_frac.idxmax())

{'(AGGGTT)n': 'Klf4/metacluster_0/pattern_18',
 'BGLII': 'Nanog/metacluster_0/pattern_14',
 'ID_B1': 'Klf4/metacluster_0/pattern_5',
 'MMERVK9C_I-int': 'Sox2/metacluster_0/pattern_6',
 'ORR1A2': 'Klf4/metacluster_0/pattern_12',
 'ORR1C2': 'Sox2/metacluster_0/pattern_5',
 'RLTR13B1': 'Klf4/metacluster_0/pattern_16',
 'RLTR13D6': 'Nanog/metacluster_0/pattern_8',
 'RLTR9E': 'Klf4/metacluster_0/pattern_9',
 'RMER10A': 'Sox2/metacluster_0/pattern_8'}

fig.savefig(f'{fdir}/pattern-table.all.pdf')

	n_pattern	repeat_name	repeat_family	LTR_overlap_frac	n_repeat_name	n_repeat_frac
pattern_name
Nanog/metacluster_0/pattern_7	193	RLTR9E	LTR/ERVK	0.9171	84	0.4352
Nanog/metacluster_0/pattern_6	274	RLTR9E	LTR/ERVK	0.8285	98	0.3577
Klf4/metacluster_0/pattern_9	346	RLTR9E	LTR/ERVK	0.9884	235	0.6792
...	...	...	...	...	...	...
Nanog/metacluster_0/pattern_9	186	MMERVK9C_I-int	LTR/ERVK	0.7903	69	0.3710
Sox2/metacluster_0/pattern_6	118	MMERVK9C_I-int	LTR/ERVK	0.9153	66	0.5593
Oct4/metacluster_0/pattern_4	223	MMERVK9C_I-int	LTR/ERVK	0.9013	110	0.4933

	chrom	start	end	interval_from_task	seqname	strand	pattern
0	chr5	28210007	28210077	Oct4	10	+	metacluster_0/pattern_0
1	chr17	35504019	35504089	Oct4	6	+	metacluster_0/pattern_0
2	chr13	51491821	51491891	Oct4	412	+	metacluster_0/pattern_0
...	...	...	...	...	...	...	...
5427	chr2	31469893	31469963	Oct4	14565	+	metacluster_0/pattern_0
5428	chr11	78401105	78401175	Oct4	806	-	metacluster_0/pattern_0
5429	chr13	85922832	85922902	Oct4	12787	+	metacluster_0/pattern_0

	chrom	start	end	interval_from_task	seqname	strand	pattern
0	chr5	28210007	28210077	Oct4	10	+	metacluster_0/pattern_0
1	chr17	35504019	35504089	Oct4	6	+	metacluster_0/pattern_0
2	chr13	51491821	51491891	Oct4	412	+	metacluster_0/pattern_0
...	...	...	...	...	...	...	...
5427	chr2	31469893	31469963	Oct4	14565	+	metacluster_0/pattern_0
5428	chr11	78401105	78401175	Oct4	806	-	metacluster_0/pattern_0
5429	chr13	85922832	85922902	Oct4	12787	+	metacluster_0/pattern_0

	n_pattern	repeat_name	repeat_family	LTR_overlap_frac	n_repeat_name	n_repeat_frac
pattern_name
Nanog/metacluster_0/pattern_7	193	RLTR9E	LTR/ERVK	0.9171	84	0.4352
Nanog/metacluster_0/pattern_6	274	RLTR9E	LTR/ERVK	0.8285	98	0.3577
Klf4/metacluster_0/pattern_9	346	RLTR9E	LTR/ERVK	0.9884	235	0.6792
Klf4/metacluster_0/pattern_18	106	(AGGGTT)n	Simple_repeat	0.9151	17	0.1604
Klf4/metacluster_0/pattern_16	119	RLTR13B1	LTR/ERVK	1.0000	76	0.6387