In [ ]:
 
In [276]:
p.attrs.keys()
Out[276]:
dict_keys(['TF', 'n_seqlets', 'features', 'align', 'cluster'])
In [326]:
dft
Out[326]:
n_pattern repeat_name repeat_family LTR_overlap_frac n_repeat_name n_repeat_frac
pattern_name
Nanog/metacluster_0/pattern_7 193 RLTR9E LTR/ERVK 0.9171 84 0.4352
Nanog/metacluster_0/pattern_6 274 RLTR9E LTR/ERVK 0.8285 98 0.3577
Klf4/metacluster_0/pattern_9 346 RLTR9E LTR/ERVK 0.9884 235 0.6792
... ... ... ... ... ... ...
Nanog/metacluster_0/pattern_9 186 MMERVK9C_I-int LTR/ERVK 0.7903 69 0.3710
Sox2/metacluster_0/pattern_6 118 MMERVK9C_I-int LTR/ERVK 0.9153 66 0.5593
Oct4/metacluster_0/pattern_4 223 MMERVK9C_I-int LTR/ERVK 0.9013 110 0.4933

21 rows × 6 columns

In [84]:
patterns_by_repeat = {k: list(v['pattern_name']) for k,v in dft.reset_index().groupby("repeat_name")}
In [339]:
patterns_by_repeat["MMERVK9C_I-int"]
Out[339]:
['Nanog/metacluster_0/pattern_9',
 'Sox2/metacluster_0/pattern_6',
 'Oct4/metacluster_0/pattern_4']
In [ ]:
"MMERVK9C_I-int": 
In [273]:
for p in long_patterns_clustered[-8:-6]:
    p.plot("seq_ic")
In [272]:
for p in long_patterns_clustered[-3:]:
    p.plot("seq_ic")
In [363]:
long_patterns_by_name[pname].attrs['stacked_seqlet_imp'].dfi
In [389]:
StackedSeqletImp.__add__
Out[389]:
<function __main__.StackedSeqletImp.__add__(self, stacked_seqlets)>
In [85]:
stacked_seqlets = StackedSeqletImp.concat(
    [patterns_by_name[pname].attrs['stacked_seqlet_imp']
     for pname in patterns_by_repeat["MMERVK9C_I-int"]]
)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-85-54abd224e84e> in <module>
----> 1 stacked_seqlets = StackedSeqletImp.concat(
      2     [patterns_by_name[pname].attrs['stacked_seqlet_imp']
      3      for pname in patterns_by_repeat["MMERVK9C_I-int"]]
      4 )

NameError: name 'StackedSeqletImp' is not defined
In [87]:
pgroup = [patterns_by_name[pname] for pname in patterns_by_repeat["MMERVK9C_I-int"]]
In [88]:
p1 = pgroup[0]
p2 = pgroup[1]
p3 = pgroup[1]
In [89]:
p1 = pgroup[0]
dfi_list = [p1.attrs['stacked_seqlet_imp']].dfi
for po in pgroup[1:]:
    dfi = po.attrs['stacked_seqlet_imp'].dfi
    poa = po.align(p1)
    strand_vec = dfi.strand.map({"+": 1, "-": -1})
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-89-3ef7e29659cf> in <module>
      1 p1 = pgroup[0]
----> 2 dfi_list = [p1.attrs['stacked_seqlet_imp']].dfi
      3 for po in pgroup[1:]:
      4     dfi = po.attrs['stacked_seqlet_imp'].dfi
      5     poa = po.align(p1)

AttributeError: 'list' object has no attribute 'dfi'
In [412]:
p1.attrs
Out[412]:
{'TF': 'Nanog',
 'n_seqlets': 186,
 'stacked_seqlet_imp': <basepair.modisco.core.StackedSeqletImp at 0x7fb75e7bc630>,
 'features': {'n seqlets': 186}}
In [397]:
p1a = p1.align(p2)
In [405]:
p2a = p2.align(p1)
p2a
In [409]:
p3a = p3.align(p1)
In [411]:
# NOTE -> offset + means that we have to subtract form the intervals (if on the + strand after flipping it):
# start - offset * (1 if strand == "+" else -1)
# end - offset * (1 if strand == "+" else -1)
In [410]:
p3a.attrs['align']
Out[410]:
{'use_rc': True, 'offset': 2}
In [406]:
p2a.attrs['align']
Out[406]:
{'use_rc': True, 'offset': 2}
In [407]:
p2.plot('seq_ic')
p2.rc().plot('seq_ic')
p1.plot('seq_ic')
p2a.plot('seq_ic');
In [404]:
p1.plot('seq_ic')
p1.rc().plot('seq_ic')
p2.plot('seq_ic')
p1a.plot('seq_ic');
In [399]:
p1a.attrs['']
Out[399]:
{'TF': 'Nanog',
 'n_seqlets': 186,
 'stacked_seqlet_imp': <basepair.modisco.core.StackedSeqletImp at 0x7fb75e316eb8>,
 'features': {'n seqlets': 186},
 'align': {'use_rc': True, 'offset': 2}}
In [393]:
# TODO - pairwise align these locations...

TODO

  • get seqlet locations for each
    • store as dfi
In [90]:
# fp_slice = slice(25, 175)
fp_slice = slice(10, 190)

max_vals = {t: df_info.max()[t + "_max"] for t in tasks}

fig, axes = plt.subplots(len(main_patterns), 8, figsize=get_figsize(1, aspect=1/2))
# fig, axes = plt.subplots(2, 7, figsize=get_figsize(2, aspect=1/2))
fig.subplots_adjust(hspace=0, wspace=0)

# Get the ylim for each TF
contrib_ylim = {tf: (min([p.contrib[p.attrs['TF']].min() for p in long_patterns_clustered if p.attrs['TF'] == tf] + [0]),
                     max([p.contrib[p.attrs['TF']].max() for p in long_patterns_clustered if p.attrs['TF'] == tf] + [0]))
               for tf in tasks}

for i, pname in enumerate(main_patterns):
    p = pl[pname]
     
#     if p.name in main_patterns:
#         continue
    # Motif logo
    
    ltr_name = dft.loc[p.name].repeat_name
    ltr_frac = dft.loc[p.name].LTR_overlap_frac
    
    
    ax = axes[i, 0]
    # Text columns before
    ax.text(-1, 0, p.attrs["TF"] + "\n" + str(p.attrs["n_seqlets"]), fontsize=8, horizontalalignment='right')
    seqlogo(p.get_seq_ic(), ax=ax)
    ax.set_ylim([0, 2])  # all plots have the same shape
    strip_axis(ax)
    ax.axison = False
    pos1 = ax.get_position() # get the original position 
    pos2 = [pos1.x0, pos1.y0 + pos1.height * 0.4,  pos1.width *3, pos1.height * .5]
    ax.set_position(pos2) # set a new position
    if i == 0:
        ax.set_title("Sequence\ninfo. content")

    # TOOD 
    ax = axes[i, 1]
    blank_ax(ax)
    ax = axes[i, 2]
    blank_ax(ax)
    ax = axes[i, 3]
    blank_ax(ax)
    ax.text(1, 0, ltr_name + f"\n{ltr_frac:.2f}", fontsize=8, horizontalalignment='right')
    
    # Profile columns
    for j, task in enumerate(tasks):
        ax = axes[i, 4 + j]
        fp = p.profile[task]
        
        ax.plot(fp[fp_slice, 0], color=tf_colors[task])
        ax.plot(-fp[fp_slice, 1], color=tf_colors[task], alpha=0.5)  # negative strand
        
        ax.set_ylim([-max_vals[task], max_vals[task]])  # all plots have the same shape
        strip_axis(ax)
        ax.axison = False
        
        if i == 0:
            ax.set_title(task)

fig.savefig(f'{fdir}/pattern-table.main.pdf')
In [114]:
print(dft.groupby('repeat_name').n_pattern.sum().to_string())
repeat_name
(AGGGTT)n          106
BGLII              118
ID_B1              592
MMERVK9C_I-int     527
ORR1A2             670
ORR1C2             674
RLTR13B1           442
RLTR13D6           811
RLTR9E            1635
RMER10A            107
In [120]:
mr = mr_dict[0][1]
In [121]:
dfs = mr.seqlet_df_instances()
In [137]:
mr.get_seqlet_intervals('metacluster_0/pattern_0', as_df=True)
Out[137]:
chrom start end interval_from_task seqname strand pattern
0 chr5 28210007 28210077 Oct4 10 + metacluster_0/pattern_0
1 chr17 35504019 35504089 Oct4 6 + metacluster_0/pattern_0
2 chr13 51491821 51491891 Oct4 412 + metacluster_0/pattern_0
... ... ... ... ... ... ... ...
5427 chr2 31469893 31469963 Oct4 14565 + metacluster_0/pattern_0
5428 chr11 78401105 78401175 Oct4 806 - metacluster_0/pattern_0
5429 chr13 85922832 85922902 Oct4 12787 + metacluster_0/pattern_0

5430 rows × 7 columns

In [125]:
def prefix_column(df, column, prefix):
    df[column] = prefix + df[column]
    return df
In [348]:
mr.get_seqlet_intervals("metacluster_0/pattern_0", as_df=True)
Out[348]:
chrom start end interval_from_task seqname strand pattern
0 chr5 28210007 28210077 Oct4 10 + metacluster_0/pattern_0
1 chr17 35504019 35504089 Oct4 6 + metacluster_0/pattern_0
2 chr13 51491821 51491891 Oct4 412 + metacluster_0/pattern_0
... ... ... ... ... ... ... ...
5427 chr2 31469893 31469963 Oct4 14565 + metacluster_0/pattern_0
5428 chr11 78401105 78401175 Oct4 806 - metacluster_0/pattern_0
5429 chr13 85922832 85922902 Oct4 12787 + metacluster_0/pattern_0

5430 rows × 7 columns

In [138]:
dfs = pd.concat([mr.get_seqlet_intervals(pattern, as_df=True).pipe(prefix_column, 'pattern', t + "/")
                 for t,mr in mr_dict
                 for pattern in mr.patterns()])
In [144]:
dft.head()
Out[144]:
n_pattern repeat_name repeat_family LTR_overlap_frac n_repeat_name n_repeat_frac
pattern_name
Nanog/metacluster_0/pattern_7 193 RLTR9E LTR/ERVK 0.9171 84 0.4352
Nanog/metacluster_0/pattern_6 274 RLTR9E LTR/ERVK 0.8285 98 0.3577
Klf4/metacluster_0/pattern_9 346 RLTR9E LTR/ERVK 0.9884 235 0.6792
Klf4/metacluster_0/pattern_18 106 (AGGGTT)n Simple_repeat 0.9151 17 0.1604
Klf4/metacluster_0/pattern_16 119 RLTR13B1 LTR/ERVK 1.0000 76 0.6387
In [145]:
dfs['pattern_name'] = dfs['pattern']
dfs = dfs.set_index('pattern_name')
In [150]:
# Unique seqlet locations across different positions
In [148]:
dict(dft.join(dfs).groupby('repeat_name').apply(lambda x: len(x[['seqname', 'start', 'end']].drop_duplicates())))
Out[148]:
{'(AGGGTT)n': 106,
 'BGLII': 118,
 'ID_B1': 602,
 'MMERVK9C_I-int': 529,
 'ORR1A2': 672,
 'ORR1C2': 675,
 'RLTR13B1': 454,
 'RLTR13D6': 811,
 'RLTR9E': 1639,
 'RMER10A': 107}
In [102]:
list(main_patterns)
Out[102]:
['Klf4/metacluster_0/pattern_18',
 'Nanog/metacluster_0/pattern_14',
 'Klf4/metacluster_0/pattern_5',
 'Sox2/metacluster_0/pattern_6',
 'Klf4/metacluster_0/pattern_12',
 'Sox2/metacluster_0/pattern_5',
 'Klf4/metacluster_0/pattern_16',
 'Nanog/metacluster_0/pattern_8',
 'Klf4/metacluster_0/pattern_9',
 'Sox2/metacluster_0/pattern_8']
In [153]:
# main patterns
dict(dft.groupby("repeat_name").LTR_overlap_frac.idxmax())
Out[153]:
{'(AGGGTT)n': 'Klf4/metacluster_0/pattern_18',
 'BGLII': 'Nanog/metacluster_0/pattern_14',
 'ID_B1': 'Klf4/metacluster_0/pattern_5',
 'MMERVK9C_I-int': 'Sox2/metacluster_0/pattern_6',
 'ORR1A2': 'Klf4/metacluster_0/pattern_12',
 'ORR1C2': 'Sox2/metacluster_0/pattern_5',
 'RLTR13B1': 'Klf4/metacluster_0/pattern_16',
 'RLTR13D6': 'Nanog/metacluster_0/pattern_8',
 'RLTR9E': 'Klf4/metacluster_0/pattern_9',
 'RMER10A': 'Sox2/metacluster_0/pattern_8'}
In [101]:
fig.savefig(f'{fdir}/pattern-table.all.pdf')