In [1]:
import os
import modisco.visualization.viz_sequence as viz_sequence
import h5py
import numpy as np
import pandas as pd
import sklearn.cluster
import scipy.cluster.hierarchy
import collections
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import modisco
import tqdm
tqdm.tqdm_notebook()
TF-MoDISco is using the TensorFlow backend.
/users/amtseng/miniconda3/envs/tfmodisco-mini2/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:516: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
/users/amtseng/miniconda3/envs/tfmodisco-mini2/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:517: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
/users/amtseng/miniconda3/envs/tfmodisco-mini2/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:518: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
/users/amtseng/miniconda3/envs/tfmodisco-mini2/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:519: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
/users/amtseng/miniconda3/envs/tfmodisco-mini2/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:520: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
/users/amtseng/miniconda3/envs/tfmodisco-mini2/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  np_resource = np.dtype([("resource", np.ubyte, 1)])
/users/amtseng/miniconda3/envs/tfmodisco-mini2/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:541: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
/users/amtseng/miniconda3/envs/tfmodisco-mini2/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:542: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
/users/amtseng/miniconda3/envs/tfmodisco-mini2/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:543: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
/users/amtseng/miniconda3/envs/tfmodisco-mini2/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:544: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
/users/amtseng/miniconda3/envs/tfmodisco-mini2/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:545: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
/users/amtseng/miniconda3/envs/tfmodisco-mini2/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:550: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  np_resource = np.dtype([("resource", np.ubyte, 1)])
/users/amtseng/miniconda3/envs/tfmodisco-mini2/lib/python3.7/site-packages/sklearn/utils/deprecation.py:143: FutureWarning: The sklearn.neighbors.kde module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.neighbors. Anything that cannot be imported from sklearn.neighbors is now part of the private API.
  warnings.warn(message, FutureWarning)
/users/amtseng/miniconda3/envs/tfmodisco-mini2/lib/python3.7/site-packages/ipykernel_launcher.py:13: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  del sys.path[0]
Out[1]:
0it [00:00, ?it/s]
In [2]:
# Plotting defaults
font_manager.fontManager.ttflist.extend(
    font_manager.createFontList(
        font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts")
    )
)
plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.family": "Roboto",
    "font.weight": "bold"
}
plt.rcParams.update(plot_params)
/users/amtseng/miniconda3/envs/tfmodisco-mini2/lib/python3.7/site-packages/ipykernel_launcher.py:4: MatplotlibDeprecationWarning: 
The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead.
  after removing the cwd from sys.path.

Define constants and paths

In [3]:
input_length = 1346
shap_score_center_size = 400
In [4]:
shap_scores_path = os.environ["TFM_SHAP_PATH"]
tfm_results_path = os.environ["TFM_TFM_PATH"]
peak_bed_path = os.environ["TFM_PEAK_PATH"]
task_index = int(os.environ["TFM_TASK_INDEX"])
profile_hdf5_path = os.environ["TFM_PROFILE_PATH"]
print("DeepSHAP scores path: %s" % shap_scores_path)
print("TF-MoDISco results path: %s" % tfm_results_path)
print("Peak BED path: %s" % peak_bed_path)
print("Task index: %d" % task_index)
print("Profile HDF5 path: %s" % profile_hdf5_path)
DeepSHAP scores path: /users/amtseng/gata1/results/shap_scores/cutnrun//gata1_shap_scores_task3.h5
TF-MoDISco results path: /users/amtseng/gata1/results/tfmodisco/cutnrun//gata1_tfm_task3.h5
Peak BED path: /users/amtseng/gata1/data/processed/cutnrun/labels//gata1_patient-day12_idr-peaks.bed.gz
Task index: 3
Profile HDF5 path: /users/amtseng/gata1/data/processed/cutnrun/labels//gata1_profiles.h5

Helper functions

In [5]:
def import_shap_scores(shap_scores_path):
    """
    Imports the set of SHAP scores used for the TF-MoDISco run. The SHAP scores
    are not cut down to a centered size yet.
    Arguments:
        `shap_scores_path`: path to HDF5 containing SHAP scores used to run
            TF-MoDISco, in the same order
    Returns an N x I x 4 array of hypothetical scores, an N x I x 4 array of
    input sequences, and an N x 3 object array of corresponding coordinates.
    """
    with h5py.File(shap_scores_path, "r") as f:
        coords_chrom = f["coords_chrom"][:].astype(str)
        coords_start = f["coords_start"][:]
        coords_end = f["coords_end"][:]
        one_hot_seqs = f["input_seqs"][:]
        hyp_scores = f["hyp_scores"][:]
    
    coords = np.empty((len(coords_chrom), 3), dtype=object)
    coords[:, 0] = coords_chrom
    coords[:, 1] = coords_start
    coords[:, 2] = coords_end
    
    return hyp_scores, one_hot_seqs, coords
In [6]:
def import_tfmodisco_results(tfm_results_path, hyp_scores, one_hot_seqs, input_center_cut_size):
    """
    Imports the TF-MoDISco results object.
    Arguments:
        `tfm_results_path`: path to HDF5 containing TF-MoDISco results
        `hyp_scores`: hypothetical importance scores used for this run
        `one_hot_seqs`: input sequences used for this run
        `input_center_cut_size`: centered cut size of SHAP scores used
    """ 
    # Cut everything to `input_center_cut_size`
    seq_len = one_hot_seqs.shape[1]
    start = (seq_len // 2) - (input_center_cut_size // 2)
    end = start + input_center_cut_size
    one_hot_seqs = one_hot_seqs[:, start:end]
    hyp_scores = hyp_scores[:, start:end]
    act_scores = hyp_scores * one_hot_seqs
    
    track_set = modisco.tfmodisco_workflow.workflow.prep_track_set(
        task_names=["task0"],
        contrib_scores={"task0": act_scores},
        hypothetical_contribs={"task0": hyp_scores},
        one_hot=one_hot_seqs
    )
    
    with h5py.File(tfm_results_path,"r") as f:
        return modisco.tfmodisco_workflow.workflow.TfModiscoResults.from_hdf5(f, track_set=track_set)
In [7]:
def extract_coords(seqlets_arr, shap_coords, one_hot_seqs, hyp_scores, input_length, input_center_cut_size):
    """
    From the seqlets object of a TF-MoDISco pattern's seqlets and alignments,
    extracts the predicted and observed profiles of the model, as well as the
    set of coordinates for the seqlets.
    Arguments:
        `seqlets_arr`: a TF-MoDISco pattern's seqlets object array (M-array)
        `shap_coords`: an N x 3 object array of coordinates of DeepSHAP scores
        `one_hot_seqs`: an N x I x 4 array of input sequences
        `hyp_scores`: an N x I x 4 array of hypothetical importance scores
        `input_length`: length of original input sequences, I
        `input_center_cut_size`: centered cut size of SHAP scores used
    Returns an M x Q x 4 array of one-hot seqlet sequences, an M x Q x 4 array of
    seqlet hypothetical importance scores, and an M x 3 object array of seqlet
    coordinates (i.e. coordinates of the seqlet not in terms of the DeepSHAP
    scores, but in terms of the genome), where Q is the length of seqlets.
    Note that it is important that the seqlet indices match exactly with the indices
    out of the N. This should be the exact sequences in the original SHAP scores.
    """
    seqlet_seqs, seqlet_hyps, seqlet_coords = [], [], []

    def seqlet_coord_to_input_coord(seqlet_coord):
        return seqlet_coord + ((input_length - input_center_cut_size) // 2)
        
    # For each seqlet, fetch the true/predicted profiles
    for seqlet in seqlets_arr:
        coord_index = seqlet.coor.example_idx
        seqlet_start = seqlet.coor.start
        seqlet_end = seqlet.coor.end
        seqlet_rc = seqlet.coor.is_revcomp
        
        # Get indices of input sequence corresponding to seqlet
        inp_start = seqlet_coord_to_input_coord(seqlet_start)
        inp_end = seqlet_coord_to_input_coord(seqlet_end)
        
        if seqlet_rc:
            seqlet_seqs.append(np.flip(one_hot_seqs[coord_index, inp_start:inp_end], axis=(0, 1)))
            seqlet_hyps.append(np.flip(hyp_scores[coord_index, inp_start:inp_end], axis=(0, 1)))
        else:
            seqlet_seqs.append(one_hot_seqs[coord_index, inp_start:inp_end])
            seqlet_hyps.append(hyp_scores[coord_index, inp_start:inp_end])
            
        # Get the coordinates of the seqlet based on the input coordinates
        chrom, start, _ = shap_coords[coord_index]
        seqlet_coords.append([chrom, start + inp_start, start + inp_end])
    
    return np.stack(seqlet_seqs), np.stack(seqlet_hyps), np.array(seqlet_coords, dtype=object)
In [8]:
def import_peak_table(peak_bed_paths):
    if type(peak_bed_paths) is str:
        peak_bed_paths = [peak_bed_paths]
    tables = []
    for peak_bed_path in peak_bed_paths:
        table = pd.read_csv(
            peak_bed_path, sep="\t", header=None,  # Infer compression
            names=[
                "chrom", "peak_start", "peak_end", "name", "score",
                "strand", "signal", "pval", "qval", "summit_offset"
            ]
        )
        # Add summit location column
        table["summit"] = table["peak_start"] + table["summit_offset"]
        tables.append(table)
    return pd.concat(tables)
In [9]:
def import_true_profiles(profile_hdf5_path, coords, task_index):
    """
    Imports the set of true profiles from an HDF5.
    Arguments:
        `profile_hdf5_path`: path to profile HDF5 for the dataset
        `coords`: an N x 3 object array of coordinates; all coordinates
            need to be the same size.
        `task_index`: the specific task to import profiles for
    Returns an N x L x 2 array of profiles (negative and positive strands).
    """
    with h5py.File(profile_hdf5_path, "r") as f:
        profiles = [
            f[chrom][start:end, task_index] for chrom, start, end in coords
        ]
    return np.stack(profiles)
In [10]:
def plot_profiles(seqlet_true_profs, kmeans_clusters=5):
    """
    Plots the given profiles with a heatmap.
    Arguments:
        `seqlet_true_profs`: an N x O x 2 NumPy array of true profiles, either as raw
            counts or probabilities (they will be normalized)
        `kmeans_cluster`: when displaying profile heatmaps, there will be this
            many clusters
    """
    assert len(seqlet_true_profs.shape) == 3
    num_profs, width, _ = seqlet_true_profs.shape

    # First, normalize the profiles along the output profile dimension
    def normalize(arr, axis=0):
        arr_sum = np.sum(arr, axis=axis, keepdims=True)
        arr_sum[arr_sum == 0] = 1  # If 0, keep 0 as the quotient instead of dividing by 0
        return arr / arr_sum
    true_profs_norm = normalize(seqlet_true_profs, axis=1)

    # Compute the mean profiles across all examples
    true_profs_mean = np.mean(true_profs_norm, axis=0)

    # Perform k-means clustering on the true profiles, with the strands pooled
    kmeans_clusters = max(5, num_profs // 50)  # Set number of clusters based on number of profiles, with minimum
    kmeans = sklearn.cluster.KMeans(n_clusters=kmeans_clusters)
    cluster_assignments = kmeans.fit_predict(
        np.reshape(true_profs_norm, (true_profs_norm.shape[0], -1))
    )

    # Perform hierarchical clustering on the cluster centers to determine optimal ordering
    kmeans_centers = kmeans.cluster_centers_
    cluster_order = scipy.cluster.hierarchy.leaves_list(
        scipy.cluster.hierarchy.optimal_leaf_ordering(
            scipy.cluster.hierarchy.linkage(kmeans_centers, method="centroid"), kmeans_centers
        )
    )

    # Order the profiles so that the cluster assignments follow the optimal ordering
    cluster_inds = []
    for cluster_id in cluster_order:
        cluster_inds.append(np.where(cluster_assignments == cluster_id)[0])
    cluster_inds = np.concatenate(cluster_inds)

    # Compute a matrix of profiles, normalized to the maximum height, ordered by clusters
    def make_profile_matrix(flat_profs, order_inds):
        matrix = flat_profs[order_inds]
        maxes = np.max(matrix, axis=1, keepdims=True)
        maxes[maxes == 0] = 1  # If 0, keep 0 as the quotient instead of dividing by 0
        return matrix / maxes
    true_matrix = make_profile_matrix(true_profs_norm, cluster_inds)

    # Create a figure with the right dimensions
    mean_height = 4
    heatmap_height = min(num_profs * 0.004, 8)
    fig_height = mean_height + (2 * heatmap_height)
    fig, ax = plt.subplots(
        nrows=3, ncols=1, figsize=(16, fig_height), sharex=True,
        gridspec_kw={
            "height_ratios": [mean_height / fig_height, heatmap_height / fig_height, heatmap_height / fig_height]
        }
    )

    # Plot the average profiles
    ax[0].plot(true_profs_mean[:, 0], color="darkslateblue")
    ax[0].plot(-true_profs_mean[:, 1], color="darkorange")

    # Set axes on average profiles
    max_mean_val = np.max(true_profs_mean)
    mean_ylim = max_mean_val * 1.05  # Make 5% higher
    ax[0].set_title("Observed (experimental) profiles")
    ax[0].set_ylabel("Average probability")
    ax[0].set_ylim(-mean_ylim, mean_ylim)
    ax[0].label_outer()

    # Plot the heatmaps
    ax[1].imshow(true_matrix[:, :, 0], interpolation="nearest", aspect="auto", cmap="Blues")
    ax[2].imshow(true_matrix[:, :, 1], interpolation="nearest", aspect="auto", cmap="Oranges")

    # Set axes on heatmaps
    for i in (1, 2):
        ax[i].set_yticks([])
        ax[i].set_yticklabels([])
        ax[i].label_outer()
    width = true_matrix.shape[1]
    delta = 100
    num_deltas = (width // 2) // delta
    labels = list(range(max(-width // 2, -num_deltas * delta), min(width // 2, num_deltas * delta) + 1, delta))
    tick_locs = [label + max(width // 2, num_deltas * delta) for label in labels]
    ax[2].set_xticks(tick_locs)
    ax[2].set_xticklabels(labels)
    ax[2].set_xlabel("Distance from peak summit (bp)")

    fig.tight_layout()
    plt.show()
In [11]:
def get_summit_distances(coords, peak_table):
    """
    Given a set of coordinates, computes the distance of the center of each
    coordinate to the nearest summit.
    Arguments:
        `coords`: an N x 3 object array of coordinates
        `peak_table`: a 6-column table of peak data, as imported by
            `import_peak_table`
    Returns and N-array of integers, which is the distance of each coordinate
    midpoint to the nearest coordinate.
    """
    chroms = coords[:, 0]
    midpoints = (coords[:, 1] + coords[:, 2]) // 2
    dists = []
    for i in range(len(coords)):
        chrom = chroms[i]
        midpoint = midpoints[i]
        rows = peak_table[peak_table["chrom"] == chrom]
        dist_arr = (midpoint - rows["summit"]).values
        min_dist = dist_arr[np.argmin(np.abs(dist_arr))]
        dists.append(min_dist)
    return np.array(dists)
In [12]:
def plot_summit_dists(summit_dists):
    """
    Plots the distribution of seqlet distances to summits.
    Arguments:
        `summit_dists`: the array of distances as returned by
            `get_summit_distances`
    """
    plt.figure(figsize=(8, 6))
    num_bins = max(len(summit_dists) // 30, 20)
    plt.hist(summit_dists, bins=num_bins, color="purple")
    plt.title("Histogram of distance of seqlets to peak summits")
    plt.xlabel("Signed distance from seqlet center to nearest peak summit (bp)")
    plt.show()
In [13]:
background_freqs = np.array([0.27, 0.23, 0.23, 0.27])
def info_content(track, pseudocount=0.001):
    """
    Given an L x 4 track, computes information content for each base and
    returns it as an L-array.
    """
    num_bases = track.shape[1]
    # Normalize track to probabilities along base axis
    track_norm = (track + pseudocount) / (np.sum(track, axis=1, keepdims=True) + (num_bases * pseudocount))
    ic = track_norm * np.log2(track_norm / np.expand_dims(background_freqs, axis=0))
    return np.sum(ic, axis=1)

Import importance scores and TF-MoDISco results

In [14]:
# Import SHAP coordinates and one-hot sequences
hyp_scores, one_hot_seqs, shap_coords = import_shap_scores(shap_scores_path)

# Import the set of peaks
peak_table = import_peak_table(peak_bed_path)
In [15]:
# Import the TF-MoDISco results object
tfm_obj = import_tfmodisco_results(tfm_results_path, hyp_scores, one_hot_seqs, shap_score_center_size)

Plot some SHAP score tracks

Plot the central region of some randomly selected actual importance scores

In [16]:
for index in np.random.choice(hyp_scores.shape[0], size=5, replace=False):
    viz_sequence.plot_weights((hyp_scores[index] * one_hot_seqs[index])[570:770], subticks_frequency=100)

Plot TF-MoDISco results

In [17]:
motifs = []  # Save the motifs as trimmed CWMs
num_seqlets = []  # Number of seqlets for each motif
motif_seqlets = {}  # Save seqlets of each motif
metaclusters = tfm_obj.metacluster_idx_to_submetacluster_results
num_metaclusters = len(metaclusters.keys())
for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
    metacluster = metaclusters[metacluster_key]
    print("Metacluster %d/%d" % (metacluster_i + 1, num_metaclusters))
    print("==========================================")
    patterns = metacluster.seqlets_to_patterns_result.patterns
    if not patterns:
        break
    num_patterns = len(patterns)
    for pattern_i, pattern in enumerate(patterns):
        seqlets = pattern.seqlets
        print("Pattern %d/%d" % (pattern_i + 1, num_patterns))
        print("--------------------------------------")
        
        pfm = pattern["sequence"].fwd
        hcwm = pattern["task0_hypothetical_contribs"].fwd
        cwm = pattern["task0_contrib_scores"].fwd
        print("%d seqlets" % len(seqlets))
        print("Sequence")
        viz_sequence.plot_weights(pfm, subticks_frequency=10)
        print("Hypothetical contributions")
        viz_sequence.plot_weights(hcwm, subticks_frequency=10)
        print("Actual contributions (CWM)")
        viz_sequence.plot_weights(cwm, subticks_frequency=10)
        
        # Trim motif based on IC
        pfm_ic = info_content(pfm)
        ic_trim_thresh = np.max(pfm_ic) * 0.2  # Cut off anything less than 20% of max IC
        pass_inds = np.where(pfm_ic >= ic_trim_thresh)[0]
        trimmed_cwm = cwm[np.min(pass_inds): np.max(pass_inds) + 1]
        motifs.append(trimmed_cwm)
        num_seqlets.append(len(seqlets))
        
        # Get the importance scores, input sequences, and coordinates of underlying seqlets
        seqlet_seqs, seqlet_hyps, seqlet_coords = extract_coords(
            seqlets, shap_coords, one_hot_seqs, hyp_scores, input_length, shap_score_center_size
        )
        
        key_pair = (metacluster_key, pattern_i)
        motif_seqlets[key_pair] = (seqlet_seqs, seqlet_hyps)

        assert np.allclose(np.sum(seqlet_seqs, axis=0) / len(seqlet_seqs), pattern["sequence"].fwd)
        # ^Sanity check: PFM derived from seqlets match the PFM stored in the pattern
        
        # Get the set of profiles
        seqlet_true_profs = import_true_profiles(profile_hdf5_path, seqlet_coords, task_index)
            
        print("Observed (experimental) profiles")
        plot_profiles(seqlet_true_profs)
        
        print("Seqlet distance from summits")
        summit_dists = get_summit_distances(seqlet_coords, peak_table)
        plot_summit_dists(summit_dists)
Metacluster 1/1
==========================================
Pattern 1/9
--------------------------------------
1755 seqlets
Sequence
Hypothetical contributions
Actual contributions (CWM)
Observed (experimental) profiles
Seqlet distance from summits
Pattern 2/9
--------------------------------------
1103 seqlets
Sequence
Hypothetical contributions
Actual contributions (CWM)
Observed (experimental) profiles
Seqlet distance from summits
Pattern 3/9
--------------------------------------
685 seqlets
Sequence
Hypothetical contributions
Actual contributions (CWM)
Observed (experimental) profiles
Seqlet distance from summits
Pattern 4/9
--------------------------------------
673 seqlets
Sequence
Hypothetical contributions
Actual contributions (CWM)
Observed (experimental) profiles
Seqlet distance from summits
Pattern 5/9
--------------------------------------
497 seqlets
Sequence
Hypothetical contributions
Actual contributions (CWM)
Observed (experimental) profiles
Seqlet distance from summits
Pattern 6/9
--------------------------------------
387 seqlets
Sequence
Hypothetical contributions
Actual contributions (CWM)
Observed (experimental) profiles
Seqlet distance from summits
Pattern 7/9
--------------------------------------
195 seqlets
Sequence
Hypothetical contributions
Actual contributions (CWM)
Observed (experimental) profiles
Seqlet distance from summits
Pattern 8/9
--------------------------------------
172 seqlets
Sequence
Hypothetical contributions
Actual contributions (CWM)
Observed (experimental) profiles
Seqlet distance from summits
Pattern 9/9
--------------------------------------
176 seqlets
Sequence
Hypothetical contributions
Actual contributions (CWM)
Observed (experimental) profiles
Seqlet distance from summits

Summary of motifs

Motifs are trimmed based on information content, and presented in descending order by number of supporting seqlets

In [18]:
for i in np.flip(np.argsort(num_seqlets)):
    print("Number of seqlets: %d" % num_seqlets[i])
    viz_sequence.plot_weights(motifs[i])
Number of seqlets: 1755
Number of seqlets: 1103
Number of seqlets: 685
Number of seqlets: 673
Number of seqlets: 497
Number of seqlets: 387
Number of seqlets: 195
Number of seqlets: 176
Number of seqlets: 172

Sample of seqlets supporting each motif

In [19]:
num_seqlets_to_show = 20
metaclusters = tfm_obj.metacluster_idx_to_submetacluster_results
num_metaclusters = len(metaclusters.keys())
for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
    metacluster = metaclusters[metacluster_key]
    print("Metacluster %d/%d" % (metacluster_i + 1, num_metaclusters))
    print("==========================================")
    patterns = metacluster.seqlets_to_patterns_result.patterns
    if not patterns:
        break
    num_patterns = len(patterns)
    for pattern_i, pattern in enumerate(patterns):
        seqlets = pattern.seqlets
        print("Pattern %d/%d" % (pattern_i + 1, num_patterns))
        print("--------------------------------------")
        print("Actual contributions (CWM)")
        viz_sequence.plot_weights(pattern["task0_contrib_scores"].fwd, subticks_frequency=10)
        
        key_pair = (metacluster_key, pattern_i)
        seqlet_seqs, seqlet_hyps = motif_seqlets[key_pair]
        
        print("Sample of seqlets")
        sample_size = min(num_seqlets_to_show, len(seqlet_seqs))
        sample_inds = np.random.choice(len(seqlet_seqs), size=sample_size, replace=False)
        for i in sample_inds:
            viz_sequence.plot_weights(seqlet_seqs[i] * seqlet_hyps[i], subticks_frequency=10)
Metacluster 1/1
==========================================
Pattern 1/9
--------------------------------------
Actual contributions (CWM)
Sample of seqlets
Pattern 2/9
--------------------------------------
Actual contributions (CWM)
Sample of seqlets
Pattern 3/9
--------------------------------------
Actual contributions (CWM)
Sample of seqlets
Pattern 4/9
--------------------------------------
Actual contributions (CWM)
Sample of seqlets
Pattern 5/9
--------------------------------------
Actual contributions (CWM)
Sample of seqlets
Pattern 6/9
--------------------------------------
Actual contributions (CWM)
Sample of seqlets
Pattern 7/9
--------------------------------------
Actual contributions (CWM)
Sample of seqlets
Pattern 8/9
--------------------------------------
Actual contributions (CWM)
Sample of seqlets
Pattern 9/9
--------------------------------------
Actual contributions (CWM)
Sample of seqlets