Goal

  • cluster the motifs per tf
  • match the TEs to the all/deeplift/profile run

Tasks

  • Load patterns from each task
    • add the annotation where they came from
  • merge all together
  • get the affinity matrix
  • cluster them
  • write out a table
In [2]:
# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
hv.extension('bokeh')
Using TensorFlow backend.
In [3]:
# Common paths
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dirs = model_dir / f"modisco/by_peak_tasks/per-tf"
tasks = ['Oct4', 'Sox2', 'Nanog', 'Klf4']
In [ ]:
# create_tf_session(0)
In [5]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from uuid import uuid4
from basepair.math import mean
from basepair.stats import perc

from IPython.display import display, HTML
from basepair.plot.vdom import df2html, df2html_old, render_datatable, vdom_footprint
from basepair.plot.utils import strip_axis
from basepair.plot.tracks import tidy_motif_plot
from basepair.modisco.core import patterns_to_df
from basepair.modisco.utils import longer_pattern, shorten_pattern, extract_name_short
from basepair.imports import *

paper_config()

Load patterns from each task

  • add the annotation where they came from
In [26]:
patterns[0].name
Out[26]:
'metacluster_0/pattern_0'
In [ ]:
mr
In [37]:
mr = ModiscoResult(modisco_dirs / tf / "modisco.h5")
mr.open()
In [44]:
patterns = []
def rename(p, tf):
    """Return a copy of pattern *p* renamed to '<tf>_p<idx>'.

    e.g. 'metacluster_0/pattern_8' for TF 'Klf4' -> 'Klf4_p8'.
    """
    # shorten_pattern gives e.g. 'm0_p8'; keep only the 'p8' suffix
    suffix = shorten_pattern(p.name).split("_")[1]
    renamed = p.copy()
    renamed.name = f"{tf}_{suffix}"
    return renamed
for tf in tasks:
    # one modisco run per TF
    mr = ModiscoResult(modisco_dirs / tf / "modisco.h5")
    mr.open()
    # tag each pattern with its source TF and the number of supporting seqlets
    tf_patterns = [
        rename(mr.get_pattern(p), tf).add_attr(
            "features", {"n seqlets": mr.n_seqlets(*p.split("/"))})
        for p in mr.patterns()
    ]
    patterns = patterns + tf_patterns
    mr.close()

get the affinity matrix

In [28]:
from basepair.exp.chipnexus.motif_clustering import similarity_matrix
sim_all_seq = similarity_matrix(patterns, track='seq_ic')
100%|██████████| 39/39 [00:06<00:00,  6.79it/s]
In [29]:
# Histogram of pairwise motif similarities (upper triangle only,
# so each pair is counted once and the diagonal self-similarity is excluded)
fig = plt.figure(figsize=get_figsize(0.25))
iu_all = np.triu_indices(len(sim_all_seq), 1)  # indices above the diagonal
plt.hist(sim_all_seq[iu_all], bins=100);
plt.title("Similarity (seq_ic)")
plt.ylabel("Frequency");
plt.xlabel("Similarity between two motifs (all)");
# plt.savefig(f"{figures}/all.similarity.hist.pdf")
# plt.savefig(f"{figures}/all.similarity.hist.png")
In [30]:
from scipy.cluster.hierarchy import linkage, optimal_leaf_ordering, cut_tree, leaves_list
from basepair.modisco.motif_clustering import to_colors

cluster them

In [31]:
# Hierarchical clustering of the motifs from the similarity matrix
iu_all = np.triu_indices(len(sim_all_seq), 1)
# linkage expects a condensed distance vector -> convert similarity to distance
lm_all_seq = linkage(1-sim_all_seq[iu_all], 'ward', optimal_ordering=True)
# flat cluster labels (n_clusters=9 chosen manually — TODO confirm from dendrogram)
clusters_all_seq = cut_tree(lm_all_seq, n_clusters=9)[:,0]
names = [x.name for x in patterns]
# heatmap of the similarity matrix ordered by the same linkage on both axes
cm_all = sns.clustermap(pd.DataFrame(sim_all_seq, index=names, columns=names),
                        row_linkage=lm_all_seq,
                        col_linkage=lm_all_seq,
                        cmap='Blues',
                        yticklabels=True,
                        figsize=get_figsize(1, aspect=1)
                       ).fig.suptitle('Seq IC sim');
# plt.savefig(f"{figures}/all.heatmap.pdf")
# plt.savefig(f"{figures}/all.heatmap.png")

Align motifs

In [45]:
from basepair.modisco.motif_clustering import create_pattern_table, align_clustered_patterns
np.random.seed(42)  # fix RNG so the alignment trials are reproducible
# order patterns by their position in the optimally-ordered dendrogram leaves
cluster_order = np.argsort(leaves_list(lm_all_seq))
cluster = clusters_all_seq

patterns_all_clustered = align_clustered_patterns(patterns, cluster_order, cluster, 
                                                 align_track='seq_ic',
                                                 metric='continousjaccard',
                                                 # don't shift the major patterns
                                                 # by more than 15 bp when aligning
                                                 trials=20,
                                                 max_shift=15)
100%|██████████| 39/39 [00:00<00:00, 280.36it/s]
In [65]:
def create_pattern_table(patterns,
                         logo_len=30,
                         footprint_width=320,
                         footprint_kwargs=None,
                         seqlogo_kwargs=None,
                         # report_url='results.html'
                         n_jobs=10):
    """Create a summary table (one row per pattern) for a list of patterns.

    Each row carries the features stored under ``p.attrs['features']`` plus
    the cluster id, the pattern name and the contribution/sequence logos
    rendered as inline HTML ``<img>`` tags.

    Args:
      patterns: list of patterns with ``attrs['features']`` and
        ``attrs['cluster']`` set (e.g. output of `align_clustered_patterns`)
      logo_len: width (in positions) to which each pattern is resized
        before rendering the logos
      footprint_width: currently unused — kept for interface compatibility
        with the footprint-sparkline variant of this table
      footprint_kwargs: currently unused (see `footprint_width`)
      seqlogo_kwargs: extra kwargs forwarded to ``vdom_plot``
        (e.g. ``dict(width=420)``); may be None
      n_jobs: number of parallel workers used to render the rows

    Returns:
      pd.DataFrame with one row per pattern; 'n seqlets' is cast to float.
    """
    # Handle an empty pattern list gracefully instead of IndexError'ing.
    if not patterns:
        return pd.DataFrame()
    # The default is None, so normalize before splatting below — previously
    # calling without seqlogo_kwargs raised TypeError on `**None`.
    seqlogo_kwargs = seqlogo_kwargs or {}

    def extract_info(pattern_i):
        """Build the table row (dict) for patterns[pattern_i]."""
        p = patterns[pattern_i]
        # Copy so building the table does not mutate the pattern's own
        # attrs['features'] dict in place.
        d = dict(p.attrs['features'])
        d['cluster'] = p.attrs['cluster']
        d['pattern'] = p.name

        # Inline HTML sequence logos (contribution scores and sequence IC)
        d['logo_imp'] = p.resize(logo_len).vdom_plot('contrib', as_html=True, **seqlogo_kwargs)
        d['logo_seq'] = p.resize(logo_len).vdom_plot('seq', as_html=True, **seqlogo_kwargs)
        return d

    rows = Parallel(n_jobs=n_jobs)(delayed(extract_info)(i)
                                   for i in tqdm(range(len(patterns))))
    df = pd.DataFrame(rows)

    # use float for n seqlets (consistent dtype for sorting/formatting)
    df['n seqlets'] = df['n seqlets'].astype(float)
    return df
In [60]:
patterns_all_clustered[0].attrs
Out[60]:
{'features': {'n seqlets': 135},
 'align': {'use_rc': False, 'offset': -5},
 'cluster': 1}
In [66]:
pattern_table_all = create_pattern_table(patterns_all_clustered,
                                         logo_len=70, 
                                         seqlogo_kwargs=dict(width=420),
                                         n_jobs=20,
                                         footprint_width=120,
                                         footprint_kwargs=dict(figsize=(3,1.5)))
100%|██████████| 39/39 [00:00<00:00, 1413.63it/s]
In [67]:
pattern_table_all.head()
Out[67]:
cluster logo_imp logo_seq n seqlets pattern
0 1 <img width=420 src="d... <img width=420 src="d... 135.0 Klf4_p8
1 1 <img width=420 src="d... <img width=420 src="d... 89.0 Nanog_p15
2 1 <img width=420 src="d... <img width=420 src="d... 164.0 Nanog_p10
3 1 <img width=420 src="d... <img width=420 src="d... 102.0 Oct4_p2
4 1 <img width=420 src="d... <img width=420 src="d... 510.0 Nanog_p3
In [72]:
del pattern_table_all['cluster']

write out a table

In [73]:
from basepair.modisco.table import write_modisco_table

def get_first_columns(df, cols):
    """Return *df* with *cols* moved to the front.

    The remaining columns keep their original relative order.
    """
    trailing = [name for name in df.columns if name not in cols]
    return df[cols + trailing]

(modisco_dirs / 'motif_clustering').mkdir(exist_ok=True)
In [76]:
first_columns = ['pattern', 'n seqlets', 'logo_seq', 'logo_imp']
In [80]:
# add a running row index and move the key columns to the front
pattern_table_all['i'] = np.arange(len(pattern_table_all), dtype=int)
pattern_table_all = get_first_columns(pattern_table_all, ['i'] + first_columns)
write_modisco_table(pattern_table_all, modisco_dirs, report_url='results.html', prefix='pattern_table', write_csv=False, doc={})
In [ ]:
# TODO - for each TF 
In [84]:
ref_modisco_dir = model_dir / f"modisco/all/deeplift/profile"
In [85]:
ls {ref_modisco_dir}
centroid_seqlet_matches.csv  log/                         plots/
cluster-patterns.ipynb       modisco.h5                   results.html
figures/                     motif_clustering/            results.ipynb
footprints.pkl               patterns.pkl                 seqlets/
hparams.yaml                 pattern_table.clustered.csv  spacing/
instances.parq               pattern_table.csv
kwargs.json                  pattern_table.html
In [86]:
ref_patterns = read_pkl(ref_modisco_dir / 'patterns.pkl')
In [87]:
ref_patterns[0].attrs.keys()
Out[87]:
dict_keys(['features', 'align', 'cluster', 'stacked_seqlet_imp'])
In [89]:
ls {ref_modisco_dir / "motif_clustering"}
motif-similarity-test.csv  patterns_long.csv   patterns_short.csv
patterns_all.html          patterns_long.html  patterns_short.html
In [90]:
dfp = pd.read_csv(ref_modisco_dir / 'motif_clustering/patterns_long.csv')
In [91]:
dfp.head()
Out[91]:
i pattern cluster n seqlets Klf4/d_p Nanog/d_p Oct4/d_p Sox2/d_p Klf4 footprint counts Klf4 footprint entropydiff Klf4 footprint max Klf4 footprint strandcor Klf4 imp counts Klf4 imp profile Klf4 periodicity 10bp Klf4 pos meandiff Klf4 pos std Klf4 pos unimodal Klf4 region counts Klf4/d Nanog footprint counts Nanog footprint entropydiff Nanog footprint max Nanog footprint strandcor Nanog imp counts Nanog imp profile Nanog periodicity 10bp Nanog pos meandiff Nanog pos std Nanog pos unimodal Nanog region counts Nanog/d Oct4 footprint counts Oct4 footprint entropydiff Oct4 footprint max Oct4 footprint strandcor Oct4 imp counts Oct4 imp profile Oct4 periodicity 10bp Oct4 pos meandiff Oct4 pos std Oct4 pos unimodal Oct4 region counts Oct4/d Sox2 footprint counts Sox2 footprint entropydiff Sox2 footprint max Sox2 footprint strandcor Sox2 imp counts Sox2 imp profile Sox2 periodicity 10bp Sox2 pos meandiff Sox2 pos std Sox2 pos unimodal Sox2 region counts Sox2/d consensus ic pwm mean idx
0 0 m0_p22 1 97.0 85.7143 89.2857 85.7143 92.8571 244.1134 0.4371 2.7474 0.0061 0.0058 0.0127 0.0094 1.9450 32.5407 True 364.8660 0.0196 310.3299 0.7128 3.9227 0.0077 0.0076 0.0192 0.0179 1.9433 33.3293 True 450.4742 0.0308 200.0412 0.4737 2.7010 0.0063 0.0072 0.0095 0.0133 1.9431 36.7368 True 411.9587 0.0118 108.8660 0.7466 1.8969 0.0083 0.0061 0.0121 0.0255 1.9767 44.5183 True 218.5773 0.0181 GAGTCTTGGAGAAGAGCAGCG... 0.9261 22
1 1 m0_p28 1 83.0 75.0000 100.0000 57.1429 96.4286 276.3132 0.3742 2.9940 0.0057 0.0054 0.0121 0.0124 1.8877 29.8808 True 453.4096 0.0187 639.5422 0.7571 11.6747 0.0073 0.0073 0.0214 0.0177 1.9165 29.8454 True 870.7711 0.0355 168.9518 0.3141 1.4639 0.0052 0.0049 0.0053 0.0081 1.9370 46.9576 True 387.6626 0.0058 180.2530 0.8536 3.9639 0.0098 0.0051 0.0120 0.0216 1.9615 34.4114 True 304.2410 0.0188 GTGTCTTGGAGAAGAGCAGCG... 1.3367 28
2 2 m0_p14 1 208.0 78.5714 96.4286 96.4286 100.0000 184.5962 0.3349 1.6154 0.0057 0.0059 0.0123 0.0084 1.9364 47.0317 True 315.2981 0.0188 511.5385 0.7378 10.6178 0.0085 0.0104 0.0218 0.0107 1.9836 30.9327 True 724.4663 0.0331 371.1250 0.3952 4.6322 0.0063 0.0102 0.0132 0.0190 2.0022 41.3190 True 662.6635 0.0161 107.3894 0.5868 1.6611 0.0075 0.0080 0.0138 0.0239 2.0244 43.0782 True 222.2404 0.0196 GTCTCAGAGAAGAGCAGCGGT... 1.1363 14
3 3 m0_p9 1 315.0 89.2857 92.8571 39.2857 89.2857 343.9016 0.3412 2.8841 0.0061 0.0056 0.0127 0.0095 1.9656 27.7532 True 510.0730 0.0199 448.4095 0.6032 7.0413 0.0074 0.0070 0.0193 0.0121 1.9719 23.8321 True 650.7747 0.0315 137.6572 0.1647 0.8270 0.0053 0.0049 0.0050 0.0057 1.9884 33.9589 True 331.3016 0.0052 147.6572 0.6998 2.8810 0.0096 0.0050 0.0111 0.0263 1.9919 34.7081 True 265.6381 0.0171 GTCTTGAAGAGAACAGGGGCA... 0.9573 9
4 4 m5_p2 1 730.0 71.4286 60.7143 10.7143 64.2857 346.7833 0.2962 2.8573 0.0056 0.0057 0.0121 0.0268 1.9664 51.7878 True 544.0918 0.0184 335.1427 0.4531 4.0480 0.0067 0.0058 0.0140 0.0318 1.9721 43.6132 True 537.7411 0.0223 118.5953 0.1244 0.7174 0.0051 0.0043 0.0040 0.0204 2.0369 90.6160 True 330.7480 0.0038 128.2483 0.6048 2.5384 0.0089 0.0045 0.0088 0.0386 2.0155 70.0416 True 254.5493 0.0131 AGTATCTTGAAGAGGAACAGC... 0.7585 97
In [97]:
shorten_pattern(ref_patterns[0].name)
Out[97]:
'm1_p7'
In [124]:
d_ref_patterns_te = {shorten_pattern(p.name): p for p in ref_patterns}
In [125]:
ref_patterns_te =  [d_ref_patterns_te[pn] for pn in dfp.pattern]
In [127]:
def similarity_matrix_two_patterns(patterns, patterns_b, **kwargs):
    """Cross similarity matrix between two lists of patterns.

    For every pattern in *patterns*, the similarity to each pattern in
    *patterns_b* is the max over the pattern and its reverse complement.
    Also collects, per query pattern, the (aligned query, best match) pair.

    Returns:
      (closest_patterns, dfm): list of aligned pairs and the similarity
      DataFrame (rows indexed by *patterns* names, columns by *patterns_b*).
    """
    sim = np.zeros((len(patterns), len(patterns_b)))
    closest_patterns = []
    for i in tqdm(range(len(patterns))):
        pa = patterns[i]
        for j, pb in enumerate(patterns_b):
            # strand-aware similarity: take the better of fwd / rev-comp
            sim[i, j] = max(pa.similarity(pb, **kwargs),
                            pa.rc().similarity(pb, **kwargs))
        best = patterns_b[sim[i].argmax()]
        closest_patterns.append((pa.align(best, **kwargs), best))
    dfm = pd.DataFrame(sim,
                       index=[p.name for p in patterns],
                       columns=[p.name for p in patterns_b])
    return closest_patterns, dfm
In [120]:
# keep only high-information-content motifs (threshold 40 chosen by eye
# from the seq_info_content histogram — TODO confirm)
te_merged_patterns = [p for p in patterns if p.seq_info_content > 40]
In [119]:
plt.hist([p.seq_info_content for p in patterns], 30);
In [128]:
pattern_pairs, m = similarity_matrix_two_patterns(te_merged_patterns, ref_patterns_te)
100%|██████████| 17/17 [00:01<00:00,  9.48it/s]
In [130]:
sns.heatmap(m.T)
Out[130]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f021f773da0>
In [134]:
def _plot_logo(p):
    """Plot one seq_ic logo with axes stripped and a fixed 0-2 bit y-range."""
    p.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0, 2])
    plt.gca().axison = False

# show each query motif directly above its closest reference motif
for pa, pb in pattern_pairs:
    _plot_logo(pa)
    _plot_logo(pb)