import os
import modisco.visualization.viz_sequence as viz_sequence
import h5py
import numpy as np
import pandas as pd
import sklearn.cluster
import scipy.cluster.hierarchy
import collections
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import modisco
import tqdm
# Plotting defaults
font_manager.fontManager.ttflist.extend(
font_manager.createFontList(
font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts")
)
)
plot_params = {
"figure.titlesize": 22,
"axes.titlesize": 22,
"axes.labelsize": 20,
"legend.fontsize": 18,
"xtick.labelsize": 16,
"ytick.labelsize": 16,
"font.family": "Roboto",
"font.weight": "bold"
}
plt.rcParams.update(plot_params)
input_length = 1346
shap_score_center_size = 400
shap_scores_path = os.environ["TFM_SHAP_PATH"]
tfm_results_path = os.environ["TFM_TFM_PATH"]
peak_bed_path = os.environ["TFM_PEAK_PATH"]
task_index = int(os.environ["TFM_TASK_INDEX"])
profile_hdf5_path = os.environ["TFM_PROFILE_PATH"]
print("DeepSHAP scores path: %s" % shap_scores_path)
print("TF-MoDISco results path: %s" % tfm_results_path)
print("Peak BED path: %s" % peak_bed_path)
print("Task index: %d" % task_index)
print("Profile HDF5 path: %s" % profile_hdf5_path)
def import_shap_scores(shap_scores_path):
"""
Imports the set of SHAP scores used for the TF-MoDISco run. The SHAP scores
are not cut down to a centered size yet.
Arguments:
        `shap_scores_path`: path to HDF5 containing the SHAP scores used to
            run TF-MoDISco, in the same example order as the run itself
Returns an N x I x 4 array of hypothetical scores, an N x I x 4 array of
input sequences, and an N x 3 object array of corresponding coordinates.
"""
with h5py.File(shap_scores_path, "r") as f:
coords_chrom = f["coords_chrom"][:].astype(str)
coords_start = f["coords_start"][:]
coords_end = f["coords_end"][:]
one_hot_seqs = f["input_seqs"][:]
hyp_scores = f["hyp_scores"][:]
coords = np.empty((len(coords_chrom), 3), dtype=object)
coords[:, 0] = coords_chrom
coords[:, 1] = coords_start
coords[:, 2] = coords_end
return hyp_scores, one_hot_seqs, coords
def import_tfmodisco_results(tfm_results_path, hyp_scores, one_hot_seqs, input_center_cut_size):
"""
Imports the TF-MoDISco results object.
Arguments:
`tfm_results_path`: path to HDF5 containing TF-MoDISco results
`hyp_scores`: hypothetical importance scores used for this run
`one_hot_seqs`: input sequences used for this run
        `input_center_cut_size`: centered cut size of SHAP scores used
    Returns the imported `TfModiscoResults` object.
    """
# Cut everything to `input_center_cut_size`
seq_len = one_hot_seqs.shape[1]
start = (seq_len // 2) - (input_center_cut_size // 2)
end = start + input_center_cut_size
one_hot_seqs = one_hot_seqs[:, start:end]
hyp_scores = hyp_scores[:, start:end]
act_scores = hyp_scores * one_hot_seqs
track_set = modisco.tfmodisco_workflow.workflow.prep_track_set(
task_names=["task0"],
contrib_scores={"task0": act_scores},
hypothetical_contribs={"task0": hyp_scores},
one_hot=one_hot_seqs
)
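    # The track set lets the restored results object recover each seqlet's
    # sequence and importance tracks, so these arrays must match the original
    # TF-MoDISco run exactly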
    with h5py.File(tfm_results_path, "r") as f:
return modisco.tfmodisco_workflow.workflow.TfModiscoResults.from_hdf5(f, track_set=track_set)
def extract_coords(seqlets_arr, shap_coords, one_hot_seqs, hyp_scores, input_length, input_center_cut_size):
"""
From the seqlets object of a TF-MoDISco pattern's seqlets and alignments,
extracts the predicted and observed profiles of the model, as well as the
set of coordinates for the seqlets.
Arguments:
`seqlets_arr`: a TF-MoDISco pattern's seqlets object array (M-array)
`shap_coords`: an N x 3 object array of coordinates of DeepSHAP scores
`one_hot_seqs`: an N x I x 4 array of input sequences
`hyp_scores`: an N x I x 4 array of hypothetical importance scores
`input_length`: length of original input sequences, I
`input_center_cut_size`: centered cut size of SHAP scores used
Returns an M x Q x 4 array of one-hot seqlet sequences, an M x Q x 4 array of
seqlet hypothetical importance scores, and an M x 3 object array of seqlet
coordinates (i.e. coordinates of the seqlet not in terms of the DeepSHAP
scores, but in terms of the genome), where Q is the length of seqlets.
Note that it is important that the seqlet indices match exactly with the indices
out of the N. This should be the exact sequences in the original SHAP scores.
"""
seqlet_seqs, seqlet_hyps, seqlet_coords = [], [], []
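    # Seqlet coordinates from TF-MoDISco are relative to the centered cut; e.g.
    # with input_length = 1346 and input_center_cut_size = 400, seqlet
    # coordinate 0 maps to input coordinate (1346 - 400) // 2 = 473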
def seqlet_coord_to_input_coord(seqlet_coord):
return seqlet_coord + ((input_length - input_center_cut_size) // 2)
# For each seqlet, fetch the true/predicted profiles
for seqlet in seqlets_arr:
coord_index = seqlet.coor.example_idx
seqlet_start = seqlet.coor.start
seqlet_end = seqlet.coor.end
seqlet_rc = seqlet.coor.is_revcomp
# Get indices of input sequence corresponding to seqlet
inp_start = seqlet_coord_to_input_coord(seqlet_start)
inp_end = seqlet_coord_to_input_coord(seqlet_end)
if seqlet_rc:
seqlet_seqs.append(np.flip(one_hot_seqs[coord_index, inp_start:inp_end], axis=(0, 1)))
seqlet_hyps.append(np.flip(hyp_scores[coord_index, inp_start:inp_end], axis=(0, 1)))
else:
seqlet_seqs.append(one_hot_seqs[coord_index, inp_start:inp_end])
seqlet_hyps.append(hyp_scores[coord_index, inp_start:inp_end])
# Get the coordinates of the seqlet based on the input coordinates
chrom, start, _ = shap_coords[coord_index]
seqlet_coords.append([chrom, start + inp_start, start + inp_end])
return np.stack(seqlet_seqs), np.stack(seqlet_hyps), np.array(seqlet_coords, dtype=object)
def import_peak_table(peak_bed_paths):
    """
    Imports a table of peaks from one or more narrowPeak-format BED files.
    Arguments:
        `peak_bed_paths`: a single path or a list of paths to 10-column
            narrowPeak files; compression is inferred from the extension
    Returns a single Pandas DataFrame over all given files, with an added
    "summit" column holding the absolute genomic coordinate of each summit.
    """
    if type(peak_bed_paths) is str:
        peak_bed_paths = [peak_bed_paths]
tables = []
for peak_bed_path in peak_bed_paths:
table = pd.read_csv(
peak_bed_path, sep="\t", header=None, # Infer compression
names=[
"chrom", "peak_start", "peak_end", "name", "score",
"strand", "signal", "pval", "qval", "summit_offset"
]
)
# Add summit location column
table["summit"] = table["peak_start"] + table["summit_offset"]
tables.append(table)
return pd.concat(tables)
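# Example usage (hypothetical paths): multiple BED files are concatenated into
# a single table, e.g. import_peak_table(["task0_peaks.bed.gz", "task1_peaks.bed.gz"])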
def import_true_profiles(profile_hdf5_path, coords, task_index):
"""
Imports the set of true profiles from an HDF5.
Arguments:
`profile_hdf5_path`: path to profile HDF5 for the dataset
`coords`: an N x 3 object array of coordinates; all coordinates
need to be the same size.
`task_index`: the specific task to import profiles for
Returns an N x L x 2 array of profiles (negative and positive strands).
"""
with h5py.File(profile_hdf5_path, "r") as f:
profiles = [
f[chrom][start:end, task_index] for chrom, start, end in coords
]
return np.stack(profiles)
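# The profile HDF5 is assumed to hold one dataset per chromosome, indexed as
# (genomic position, task, strand), so each slice above is (end - start) x 2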
def plot_profiles(seqlet_true_profs, kmeans_clusters=5):
    """
    Plots the given profiles as an average profile plus per-strand heatmaps.
    Arguments:
        `seqlet_true_profs`: an N x O x 2 NumPy array of true profiles, either
            as raw counts or probabilities (they will be normalized)
        `kmeans_clusters`: minimum number of k-means clusters to use when
            ordering the profile heatmaps; the actual number scales with N
    """
assert len(seqlet_true_profs.shape) == 3
num_profs, width, _ = seqlet_true_profs.shape
# First, normalize the profiles along the output profile dimension
def normalize(arr, axis=0):
arr_sum = np.sum(arr, axis=axis, keepdims=True)
arr_sum[arr_sum == 0] = 1 # If 0, keep 0 as the quotient instead of dividing by 0
return arr / arr_sum
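    # E.g. normalize(np.array([[2., 2.], [6., 2.]]), axis=0) gives
    # [[0.25, 0.5], [0.75, 0.5]]: each column now sums to 1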
true_profs_norm = normalize(seqlet_true_profs, axis=1)
# Compute the mean profiles across all examples
true_profs_mean = np.mean(true_profs_norm, axis=0)
# Perform k-means clustering on the true profiles, with the strands pooled
    # Set the number of clusters from the number of profiles, with `kmeans_clusters` as the floor
    kmeans_clusters = max(kmeans_clusters, num_profs // 50)
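    # E.g. with the default floor of 5, 1000 profiles yield max(5, 1000 // 50) = 20 clusters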
kmeans = sklearn.cluster.KMeans(n_clusters=kmeans_clusters)
cluster_assignments = kmeans.fit_predict(
np.reshape(true_profs_norm, (true_profs_norm.shape[0], -1))
)
# Perform hierarchical clustering on the cluster centers to determine optimal ordering
kmeans_centers = kmeans.cluster_centers_
cluster_order = scipy.cluster.hierarchy.leaves_list(
scipy.cluster.hierarchy.optimal_leaf_ordering(
scipy.cluster.hierarchy.linkage(kmeans_centers, method="centroid"), kmeans_centers
)
)
# Order the profiles so that the cluster assignments follow the optimal ordering
cluster_inds = []
for cluster_id in cluster_order:
cluster_inds.append(np.where(cluster_assignments == cluster_id)[0])
cluster_inds = np.concatenate(cluster_inds)
# Compute a matrix of profiles, normalized to the maximum height, ordered by clusters
def make_profile_matrix(flat_profs, order_inds):
matrix = flat_profs[order_inds]
maxes = np.max(matrix, axis=1, keepdims=True)
maxes[maxes == 0] = 1 # If 0, keep 0 as the quotient instead of dividing by 0
return matrix / maxes
true_matrix = make_profile_matrix(true_profs_norm, cluster_inds)
# Create a figure with the right dimensions
mean_height = 4
heatmap_height = min(num_profs * 0.004, 8)
fig_height = mean_height + (2 * heatmap_height)
fig, ax = plt.subplots(
nrows=3, ncols=1, figsize=(16, fig_height), sharex=True,
gridspec_kw={
"height_ratios": [mean_height / fig_height, heatmap_height / fig_height, heatmap_height / fig_height]
}
)
# Plot the average profiles
ax[0].plot(true_profs_mean[:, 0], color="darkslateblue")
ax[0].plot(-true_profs_mean[:, 1], color="darkorange")
# Set axes on average profiles
max_mean_val = np.max(true_profs_mean)
mean_ylim = max_mean_val * 1.05 # Make 5% higher
ax[0].set_title("Observed (experimental) profiles")
ax[0].set_ylabel("Average probability")
ax[0].set_ylim(-mean_ylim, mean_ylim)
ax[0].label_outer()
# Plot the heatmaps
ax[1].imshow(true_matrix[:, :, 0], interpolation="nearest", aspect="auto", cmap="Blues")
ax[2].imshow(true_matrix[:, :, 1], interpolation="nearest", aspect="auto", cmap="Oranges")
# Set axes on heatmaps
for i in (1, 2):
ax[i].set_yticks([])
ax[i].set_yticklabels([])
ax[i].label_outer()
width = true_matrix.shape[1]
delta = 100
num_deltas = (width // 2) // delta
    # Label ticks every `delta` bp, centered on the summit (column width // 2)
    labels = list(range(-num_deltas * delta, (num_deltas * delta) + 1, delta))
    tick_locs = [(width // 2) + label for label in labels]
ax[2].set_xticks(tick_locs)
ax[2].set_xticklabels(labels)
ax[2].set_xlabel("Distance from peak summit (bp)")
fig.tight_layout()
plt.show()
def get_summit_distances(coords, peak_table):
"""
Given a set of coordinates, computes the distance of the center of each
coordinate to the nearest summit.
Arguments:
`coords`: an N x 3 object array of coordinates
`peak_table`: a 6-column table of peak data, as imported by
`import_peak_table`
Returns and N-array of integers, which is the distance of each coordinate
midpoint to the nearest coordinate.
"""
chroms = coords[:, 0]
midpoints = (coords[:, 1] + coords[:, 2]) // 2
dists = []
for i in range(len(coords)):
chrom = chroms[i]
midpoint = midpoints[i]
rows = peak_table[peak_table["chrom"] == chrom]
dist_arr = (midpoint - rows["summit"]).values
min_dist = dist_arr[np.argmin(np.abs(dist_arr))]
dists.append(min_dist)
return np.array(dists)
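# Note: `get_summit_distances` assumes every seqlet chromosome appears in the
# peak table; a chromosome with no peaks would make the `argmin` above fail on
# an empty array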
def plot_summit_dists(summit_dists):
"""
Plots the distribution of seqlet distances to summits.
Arguments:
`summit_dists`: the array of distances as returned by
`get_summit_distances`
"""
plt.figure(figsize=(8, 6))
num_bins = max(len(summit_dists) // 30, 20)
plt.hist(summit_dists, bins=num_bins, color="purple")
plt.title("Histogram of distance of seqlets to peak summits")
plt.xlabel("Signed distance from seqlet center to nearest peak summit (bp)")
plt.show()
# Background base frequencies of the genome, in one-hot base order (A, C, G, T)
background_freqs = np.array([0.27, 0.23, 0.23, 0.27])
def info_content(track, pseudocount=0.001):
"""
Given an L x 4 track, computes information content for each base and
returns it as an L-array.
"""
num_bases = track.shape[1]
# Normalize track to probabilities along base axis
track_norm = (track + pseudocount) / (np.sum(track, axis=1, keepdims=True) + (num_bases * pseudocount))
ic = track_norm * np.log2(track_norm / np.expand_dims(background_freqs, axis=0))
return np.sum(ic, axis=1)
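# A minimal usage sketch of `info_content` (illustrative values only): a
# one-hot base carries roughly log2(1 / 0.27) ~= 1.9 bits against this
# background, while a uniform column carries ~0 bits
print(info_content(np.array([
    [1.00, 0.00, 0.00, 0.00],  # one-hot base
    [0.25, 0.25, 0.25, 0.25],  # uniform base
])))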
# Import SHAP coordinates and one-hot sequences
hyp_scores, one_hot_seqs, shap_coords = import_shap_scores(shap_scores_path)
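# Sanity check: the downstream code assumes N matched score/sequence entries
# of length `input_length`, with one coordinate triple per entry
assert hyp_scores.shape == one_hot_seqs.shape == (len(shap_coords), input_length, 4)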
# Import the set of peaks
peak_table = import_peak_table(peak_bed_path)
# Import the TF-MoDISco results object
tfm_obj = import_tfmodisco_results(tfm_results_path, hyp_scores, one_hot_seqs, shap_score_center_size)
# Plot the central region of some randomly selected actual importance scores
for index in np.random.choice(hyp_scores.shape[0], size=5, replace=False):
viz_sequence.plot_weights((hyp_scores[index] * one_hot_seqs[index])[570:770], subticks_frequency=100)
motifs = [] # Save the motifs as trimmed CWMs
num_seqlets = [] # Number of seqlets for each motif
motif_seqlets = {} # Save seqlets of each motif
metaclusters = tfm_obj.metacluster_idx_to_submetacluster_results
num_metaclusters = len(metaclusters.keys())
for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
metacluster = metaclusters[metacluster_key]
print("Metacluster %d/%d" % (metacluster_i + 1, num_metaclusters))
print("==========================================")
patterns = metacluster.seqlets_to_patterns_result.patterns
    if not patterns:
        continue  # This metacluster has no patterns; move on to the next one
num_patterns = len(patterns)
for pattern_i, pattern in enumerate(patterns):
seqlets = pattern.seqlets
print("Pattern %d/%d" % (pattern_i + 1, num_patterns))
print("--------------------------------------")
pfm = pattern["sequence"].fwd
hcwm = pattern["task0_hypothetical_contribs"].fwd
cwm = pattern["task0_contrib_scores"].fwd
print("%d seqlets" % len(seqlets))
print("Sequence")
viz_sequence.plot_weights(pfm, subticks_frequency=10)
print("Hypothetical contributions")
viz_sequence.plot_weights(hcwm, subticks_frequency=10)
print("Actual contributions (CWM)")
viz_sequence.plot_weights(cwm, subticks_frequency=10)
# Trim motif based on IC
pfm_ic = info_content(pfm)
ic_trim_thresh = np.max(pfm_ic) * 0.2 # Cut off anything less than 20% of max IC
pass_inds = np.where(pfm_ic >= ic_trim_thresh)[0]
trimmed_cwm = cwm[np.min(pass_inds): np.max(pass_inds) + 1]
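        # Note that only the flanks are trimmed: low-IC positions lying between
        # the first and last passing positions are retained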
motifs.append(trimmed_cwm)
num_seqlets.append(len(seqlets))
# Get the importance scores, input sequences, and coordinates of underlying seqlets
seqlet_seqs, seqlet_hyps, seqlet_coords = extract_coords(
seqlets, shap_coords, one_hot_seqs, hyp_scores, input_length, shap_score_center_size
)
key_pair = (metacluster_key, pattern_i)
motif_seqlets[key_pair] = (seqlet_seqs, seqlet_hyps)
        assert np.allclose(np.sum(seqlet_seqs, axis=0) / len(seqlet_seqs), pattern["sequence"].fwd)
        # ^Sanity check: the PFM derived from the seqlets matches the PFM stored in the pattern
# Get the set of profiles
seqlet_true_profs = import_true_profiles(profile_hdf5_path, seqlet_coords, task_index)
print("Observed (experimental) profiles")
plot_profiles(seqlet_true_profs)
print("Seqlet distance from summits")
summit_dists = get_summit_distances(seqlet_coords, peak_table)
plot_summit_dists(summit_dists)
for i in np.flip(np.argsort(num_seqlets)):
print("Number of seqlets: %d" % num_seqlets[i])
viz_sequence.plot_weights(motifs[i])
num_seqlets_to_show = 20
metaclusters = tfm_obj.metacluster_idx_to_submetacluster_results
num_metaclusters = len(metaclusters.keys())
for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
metacluster = metaclusters[metacluster_key]
print("Metacluster %d/%d" % (metacluster_i + 1, num_metaclusters))
print("==========================================")
patterns = metacluster.seqlets_to_patterns_result.patterns
    if not patterns:
        continue  # This metacluster has no patterns; move on to the next one
num_patterns = len(patterns)
for pattern_i, pattern in enumerate(patterns):
seqlets = pattern.seqlets
print("Pattern %d/%d" % (pattern_i + 1, num_patterns))
print("--------------------------------------")
print("Actual contributions (CWM)")
viz_sequence.plot_weights(pattern["task0_contrib_scores"].fwd, subticks_frequency=10)
key_pair = (metacluster_key, pattern_i)
seqlet_seqs, seqlet_hyps = motif_seqlets[key_pair]
print("Sample of seqlets")
sample_size = min(num_seqlets_to_show, len(seqlet_seqs))
sample_inds = np.random.choice(len(seqlet_seqs), size=sample_size, replace=False)
for i in sample_inds:
viz_sequence.plot_weights(seqlet_seqs[i] * seqlet_hyps[i], subticks_frequency=10)