Goal

  • cluster the motifs per tf
  • match the TEs to the all/deeplift/profile run

Tasks

  • Load patterns from each task
    • add the annotation where they came from
  • merge all together
  • get the affinity matrix
  • cluster them
  • write out a table
In [2]:
# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
hv.extension('bokeh')
Using TensorFlow backend.
In [3]:
# Common paths
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dirs = model_dir / f"modisco/by_peak_tasks/per-tf"
tasks = ['Oct4', 'Sox2', 'Nanog', 'Klf4']
In [ ]:
# create_tf_session(0)
In [5]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from uuid import uuid4
from basepair.math import mean
from basepair.stats import perc

from IPython.display import display, HTML
from basepair.plot.vdom import df2html, df2html_old, render_datatable, vdom_footprint
from basepair.plot.utils import strip_axis
from basepair.plot.tracks import tidy_motif_plot
from basepair.modisco.core import patterns_to_df
from basepair.modisco.utils import longer_pattern, shorten_pattern, extract_name_short
from basepair.imports import *

paper_config()

Load patterns from each task

  • add the annotation where they came from
In [26]:
patterns[0].name
Out[26]:
'metacluster_0/pattern_0'
In [ ]:
mr
In [37]:
mr = ModiscoResult(modisco_dirs / tf / "modisco.h5")
mr.open()
In [44]:
patterns = []
def rename(p, tf):
    """Return a copy of pattern *p* renamed to '<tf>_p<idx>'.

    e.g. 'metacluster_0/pattern_8' for TF 'Klf4' -> 'Klf4_p8'.
    """
    # shorten_pattern gives e.g. 'm0_p8'; keep only the 'p8' suffix
    suffix = shorten_pattern(p.name).split("_")[1]
    renamed = p.copy()
    renamed.name = f"{tf}_{suffix}"
    return renamed
for tf in tasks:
    # one modisco run per TF
    mr = ModiscoResult(modisco_dirs / tf / "modisco.h5")
    mr.open()
    # tag each pattern with its source TF and the number of supporting seqlets
    tf_patterns = [
        rename(mr.get_pattern(p), tf).add_attr(
            "features", {"n seqlets": mr.n_seqlets(*p.split("/"))})
        for p in mr.patterns()
    ]
    patterns = patterns + tf_patterns
    mr.close()

get the affinity matrix

In [28]:
from basepair.exp.chipnexus.motif_clustering import similarity_matrix
sim_all_seq = similarity_matrix(patterns, track='seq_ic')
100%|██████████| 39/39 [00:06<00:00,  6.79it/s]
In [29]:
# Histogram of pairwise motif similarities (upper triangle only,
# so each pair is counted once and the diagonal self-similarity is excluded)
fig = plt.figure(figsize=get_figsize(0.25))
iu_all = np.triu_indices(len(sim_all_seq), 1)  # indices above the diagonal
plt.hist(sim_all_seq[iu_all], bins=100);
plt.title("Similarity (seq_ic)")
plt.ylabel("Frequency");
plt.xlabel("Similarity between two motifs (all)");
# plt.savefig(f"{figures}/all.similarity.hist.pdf")
# plt.savefig(f"{figures}/all.similarity.hist.png")
In [30]:
from scipy.cluster.hierarchy import linkage, optimal_leaf_ordering, cut_tree, leaves_list
from basepair.modisco.motif_clustering import to_colors

cluster them

In [31]:
# Hierarchical clustering of the motifs from the similarity matrix
iu_all = np.triu_indices(len(sim_all_seq), 1)
# linkage expects a condensed distance vector -> convert similarity to distance
lm_all_seq = linkage(1-sim_all_seq[iu_all], 'ward', optimal_ordering=True)
# flat cluster labels (n_clusters=9 chosen manually — TODO confirm from dendrogram)
clusters_all_seq = cut_tree(lm_all_seq, n_clusters=9)[:,0]
names = [x.name for x in patterns]
# heatmap of the similarity matrix ordered by the same linkage on both axes
cm_all = sns.clustermap(pd.DataFrame(sim_all_seq, index=names, columns=names),
                        row_linkage=lm_all_seq,
                        col_linkage=lm_all_seq,
                        cmap='Blues',
                        yticklabels=True,
                        figsize=get_figsize(1, aspect=1)
                       ).fig.suptitle('Seq IC sim');
# plt.savefig(f"{figures}/all.heatmap.pdf")
# plt.savefig(f"{figures}/all.heatmap.png")

Align motifs

In [45]:
from basepair.modisco.motif_clustering import create_pattern_table, align_clustered_patterns
np.random.seed(42)  # fix RNG so the alignment trials are reproducible
# order patterns by their position in the optimally-ordered dendrogram leaves
cluster_order = np.argsort(leaves_list(lm_all_seq))
cluster = clusters_all_seq

patterns_all_clustered = align_clustered_patterns(patterns, cluster_order, cluster, 
                                                 align_track='seq_ic',
                                                 metric='continousjaccard',
                                                 # don't shift the major patterns
                                                 # by more than 15 bp when aligning
                                                 trials=20,
                                                 max_shift=15)
100%|██████████| 39/39 [00:00<00:00, 280.36it/s]
In [65]:
def create_pattern_table(patterns,
                         logo_len=30,
                         footprint_width=320,
                         footprint_kwargs=None,
                         seqlogo_kwargs=None,
                         # report_url='results.html'
                         n_jobs=10):
    """Create a summary table (one row per pattern) for a list of patterns.

    Each row carries the features stored under ``p.attrs['features']`` plus
    the cluster id, the pattern name and the contribution/sequence logos
    rendered as inline HTML ``<img>`` tags.

    Args:
      patterns: list of patterns with ``attrs['features']`` and
        ``attrs['cluster']`` set (e.g. output of `align_clustered_patterns`)
      logo_len: width (in positions) to which each pattern is resized
        before rendering the logos
      footprint_width: currently unused — kept for interface compatibility
        with the footprint-sparkline variant of this table
      footprint_kwargs: currently unused (see `footprint_width`)
      seqlogo_kwargs: extra kwargs forwarded to ``vdom_plot``
        (e.g. ``dict(width=420)``); may be None
      n_jobs: number of parallel workers used to render the rows

    Returns:
      pd.DataFrame with one row per pattern; 'n seqlets' is cast to float.
    """
    # Handle an empty pattern list gracefully instead of IndexError'ing.
    if not patterns:
        return pd.DataFrame()
    # The default is None, so normalize before splatting below — previously
    # calling without seqlogo_kwargs raised TypeError on `**None`.
    seqlogo_kwargs = seqlogo_kwargs or {}

    def extract_info(pattern_i):
        """Build the table row (dict) for patterns[pattern_i]."""
        p = patterns[pattern_i]
        # Copy so building the table does not mutate the pattern's own
        # attrs['features'] dict in place.
        d = dict(p.attrs['features'])
        d['cluster'] = p.attrs['cluster']
        d['pattern'] = p.name

        # Inline HTML sequence logos (contribution scores and sequence IC)
        d['logo_imp'] = p.resize(logo_len).vdom_plot('contrib', as_html=True, **seqlogo_kwargs)
        d['logo_seq'] = p.resize(logo_len).vdom_plot('seq', as_html=True, **seqlogo_kwargs)
        return d

    rows = Parallel(n_jobs=n_jobs)(delayed(extract_info)(i)
                                   for i in tqdm(range(len(patterns))))
    df = pd.DataFrame(rows)

    # use float for n seqlets (consistent dtype for sorting/formatting)
    df['n seqlets'] = df['n seqlets'].astype(float)
    return df
In [60]:
patterns_all_clustered[0].attrs
Out[60]:
{'features': {'n seqlets': 135},
 'align': {'use_rc': False, 'offset': -5},
 'cluster': 1}
In [66]:
pattern_table_all = create_pattern_table(patterns_all_clustered,
                                         logo_len=70, 
                                         seqlogo_kwargs=dict(width=420),
                                         n_jobs=20,
                                         footprint_width=120,
                                         footprint_kwargs=dict(figsize=(3,1.5)))
100%|██████████| 39/39 [00:00<00:00, 1413.63it/s]
In [67]:
pattern_table_all.head()
Out[67]:
cluster logo_imp logo_seq n seqlets pattern
0 1 <img width=420 src="d... <img width=420 src="d... 135.0 Klf4_p8
1 1 <img width=420 src="d... <img width=420 src="d... 89.0 Nanog_p15
2 1 <img width=420 src="d... <img width=420 src="d... 164.0 Nanog_p10
3 1 <img width=420 src="d... <img width=420 src="d... 102.0 Oct4_p2
4 1 <img width=420 src="d... <img width=420 src="d... 510.0 Nanog_p3
In [72]:
del pattern_table_all['cluster']

write out a table

In [73]:
from basepair.modisco.table import write_modisco_table

def get_first_columns(df, cols):
    """Return *df* with *cols* moved to the front.

    The remaining columns keep their original relative order.
    """
    trailing = [name for name in df.columns if name not in cols]
    return df[cols + trailing]

(modisco_dirs / 'motif_clustering').mkdir(exist_ok=True)
In [76]:
first_columns = ['pattern', 'n seqlets', 'logo_seq', 'logo_imp']
In [80]:
# add a running row index and move the key columns to the front
pattern_table_all['i'] = np.arange(len(pattern_table_all), dtype=int)
pattern_table_all = get_first_columns(pattern_table_all, ['i'] + first_columns)
write_modisco_table(pattern_table_all, modisco_dirs, report_url='results.html', prefix='pattern_table', write_csv=False, doc={})
In [ ]:
# TODO - for each TF 
In [84]:
ref_modisco_dir = model_dir / f"modisco/all/deeplift/profile"
In [85]:
ls {ref_modisco_dir}
centroid_seqlet_matches.csv  log/                         plots/
cluster-patterns.ipynb       modisco.h5                   results.html
figures/                     motif_clustering/            results.ipynb
footprints.pkl               patterns.pkl                 seqlets/
hparams.yaml                 pattern_table.clustered.csv  spacing/
instances.parq               pattern_table.csv
kwargs.json                  pattern_table.html
In [86]:
ref_patterns = read_pkl(ref_modisco_dir / 'patterns.pkl')
In [87]:
ref_patterns[0].attrs.keys()
Out[87]:
dict_keys(['features', 'align', 'cluster', 'stacked_seqlet_imp'])
In [89]:
ls {ref_modisco_dir / "motif_clustering"}
motif-similarity-test.csv  patterns_long.csv   patterns_short.csv
patterns_all.html          patterns_long.html  patterns_short.html
In [90]:
dfp = pd.read_csv(ref_modisco_dir / 'motif_clustering/patterns_long.csv')
In [91]:
dfp.head()
Out[91]:
i pattern cluster n seqlets Klf4/d_p Nanog/d_p Oct4/d_p Sox2/d_p Klf4 footprint counts Klf4 footprint entropydiff Klf4 footprint max Klf4 footprint strandcor Klf4 imp counts Klf4 imp profile Klf4 periodicity 10bp Klf4 pos meandiff Klf4 pos std Klf4 pos unimodal Klf4 region counts Klf4/d Nanog footprint counts Nanog footprint entropydiff Nanog footprint max Nanog footprint strandcor Nanog imp counts Nanog imp profile Nanog periodicity 10bp Nanog pos meandiff Nanog pos std Nanog pos unimodal Nanog region counts Nanog/d Oct4 footprint counts Oct4 footprint entropydiff Oct4 footprint max Oct4 footprint strandcor Oct4 imp counts Oct4 imp profile Oct4 periodicity 10bp Oct4 pos meandiff Oct4 pos std Oct4 pos unimodal Oct4 region counts Oct4/d Sox2 footprint counts Sox2 footprint entropydiff Sox2 footprint max Sox2 footprint strandcor Sox2 imp counts Sox2 imp profile Sox2 periodicity 10bp Sox2 pos meandiff Sox2 pos std Sox2 pos unimodal Sox2 region counts Sox2/d consensus ic pwm mean idx
0 0 m0_p22 1 97.0 85.7143 89.2857 85.7143 92.8571 244.1134 0.4371 2.7474 0.0061 0.0058 0.0127 0.0094 1.9450 32.5407 True 364.8660 0.0196 310.3299 0.7128 3.9227 0.0077 0.0076 0.0192 0.0179 1.9433 33.3293 True 450.4742 0.0308 200.0412 0.4737 2.7010 0.0063 0.0072 0.0095 0.0133 1.9431 36.7368 True 411.9587 0.0118 108.8660 0.7466 1.8969 0.0083 0.0061 0.0121 0.0255 1.9767 44.5183 True 218.5773 0.0181 GAGTCTTGGAGAAGAGCAGCG... 0.9261 22
1 1 m0_p28 1 83.0 75.0000 100.0000 57.1429 96.4286 276.3132 0.3742 2.9940 0.0057 0.0054 0.0121 0.0124 1.8877 29.8808 True 453.4096 0.0187 639.5422 0.7571 11.6747 0.0073 0.0073 0.0214 0.0177 1.9165 29.8454 True 870.7711 0.0355 168.9518 0.3141 1.4639 0.0052 0.0049 0.0053 0.0081 1.9370 46.9576 True 387.6626 0.0058 180.2530 0.8536 3.9639 0.0098 0.0051 0.0120 0.0216 1.9615 34.4114 True 304.2410 0.0188 GTGTCTTGGAGAAGAGCAGCG... 1.3367 28
2 2 m0_p14 1 208.0 78.5714 96.4286 96.4286 100.0000 184.5962 0.3349 1.6154 0.0057 0.0059 0.0123 0.0084 1.9364 47.0317 True 315.2981 0.0188 511.5385 0.7378 10.6178 0.0085 0.0104 0.0218 0.0107 1.9836 30.9327 True 724.4663 0.0331 371.1250 0.3952 4.6322 0.0063 0.0102 0.0132 0.0190 2.0022 41.3190 True 662.6635 0.0161 107.3894 0.5868 1.6611 0.0075 0.0080 0.0138 0.0239 2.0244 43.0782 True 222.2404 0.0196 GTCTCAGAGAAGAGCAGCGGT... 1.1363 14
3 3 m0_p9 1 315.0 89.2857 92.8571 39.2857 89.2857 343.9016 0.3412 2.8841 0.0061 0.0056 0.0127 0.0095 1.9656 27.7532 True 510.0730 0.0199 448.4095 0.6032 7.0413 0.0074 0.0070 0.0193 0.0121 1.9719 23.8321 True 650.7747 0.0315 137.6572 0.1647 0.8270 0.0053 0.0049 0.0050 0.0057 1.9884 33.9589 True 331.3016 0.0052 147.6572 0.6998 2.8810 0.0096 0.0050 0.0111 0.0263 1.9919 34.7081 True 265.6381 0.0171 GTCTTGAAGAGAACAGGGGCA... 0.9573 9
4 4 m5_p2 1 730.0 71.4286 60.7143 10.7143 64.2857 346.7833 0.2962 2.8573 0.0056 0.0057 0.0121 0.0268 1.9664 51.7878 True 544.0918 0.0184 335.1427 0.4531 4.0480 0.0067 0.0058 0.0140 0.0318 1.9721 43.6132 True 537.7411 0.0223 118.5953 0.1244 0.7174 0.0051 0.0043 0.0040 0.0204 2.0369 90.6160 True 330.7480 0.0038 128.2483 0.6048 2.5384 0.0089 0.0045 0.0088 0.0386 2.0155 70.0416 True 254.5493 0.0131 AGTATCTTGAAGAGGAACAGC... 0.7585 97
In [97]:
shorten_pattern(ref_patterns[0].name)
Out[97]:
'm1_p7'
In [124]:
d_ref_patterns_te = {shorten_pattern(p.name): p for p in ref_patterns}
In [125]:
ref_patterns_te =  [d_ref_patterns_te[pn] for pn in dfp.pattern]
In [127]:
def similarity_matrix_two_patterns(patterns, patterns_b, **kwargs):
    """Cross similarity matrix between two lists of patterns.

    For every pattern in *patterns*, the similarity to each pattern in
    *patterns_b* is the max over the pattern and its reverse complement.
    Also collects, per query pattern, the (aligned query, best match) pair.

    Returns:
      (closest_patterns, dfm): list of aligned pairs and the similarity
      DataFrame (rows indexed by *patterns* names, columns by *patterns_b*).
    """
    sim = np.zeros((len(patterns), len(patterns_b)))
    closest_patterns = []
    for i in tqdm(range(len(patterns))):
        pa = patterns[i]
        for j, pb in enumerate(patterns_b):
            # strand-aware similarity: take the better of fwd / rev-comp
            sim[i, j] = max(pa.similarity(pb, **kwargs),
                            pa.rc().similarity(pb, **kwargs))
        best = patterns_b[sim[i].argmax()]
        closest_patterns.append((pa.align(best, **kwargs), best))
    dfm = pd.DataFrame(sim,
                       index=[p.name for p in patterns],
                       columns=[p.name for p in patterns_b])
    return closest_patterns, dfm
In [120]:
# keep only high-information-content motifs (threshold 40 chosen by eye
# from the seq_info_content histogram — TODO confirm)
te_merged_patterns = [p for p in patterns if p.seq_info_content > 40]
In [119]:
plt.hist([p.seq_info_content for p in patterns], 30);
In [128]:
pattern_pairs, m = similarity_matrix_two_patterns(te_merged_patterns, ref_patterns_te)
100%|██████████| 17/17 [00:01<00:00,  9.48it/s]
In [130]:
sns.heatmap(m.T)
Out[130]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f021f773da0>
In [134]:
def _plot_logo(p):
    """Plot one seq_ic logo with axes stripped and a fixed 0-2 bit y-range."""
    p.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0, 2])
    plt.gca().axison = False

# show each query motif directly above its closest reference motif
for pa, pb in pattern_pairs:
    _plot_logo(pa)
    _plot_logo(pb)