Results

In [1]:
import os
import util
import viz_sequence
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import vdom.helpers as vdomh
from IPython.display import display
import tqdm
tqdm.tqdm_notebook()
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:11: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # This is added back by InteractiveShellApp.init_path()
Out[1]:
0it [00:00, ?it/s]
In [2]:
# Plotting defaults
font_manager.fontManager.ttflist.extend(
    font_manager.createFontList(
        font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts")
    )
)
plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.family": "Roboto",
    "font.weight": "bold"
}
plt.rcParams.update(plot_params)
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:4: MatplotlibDeprecationWarning: 
The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead.
  after removing the cwd from sys.path.

Define constants and paths

In [3]:
# Define parameters/fetch arguments
preds_path = os.environ["TFM_PRED_PATH"]
shap_scores_path = os.environ["TFM_SHAP_PATH"]
tfm_results_path = os.environ["TFM_TFM_PATH"]

print("Predictions path: %s" % preds_path)
print("DeepSHAP scores path: %s" % shap_scores_path)
print("TF-MoDISco results path: %s" % tfm_results_path)
Predictions path: /users/zahoor/TF-Atlas/02-16-2021/predictions/ENCSR000FAH/profile_predictions.h5
DeepSHAP scores path: /users/zahoor/TF-Atlas/02-16-2021/shap/ENCSR000FAH/profile_scores_alex_format.h5
TF-MoDISco results path: /users/zahoor/TF-Atlas/02-16-2021/modisco/ENCSR000FAH/profile/modisco_results.hd5
In [4]:
# Define constants
input_length, profile_length = 2114, 1000
shap_score_center_size = 400
hyp_score_key = "hyp_scores"
task_index = None

Helper functions

For plotting and organizing things

In [5]:
def plot_profiles(profs, title=None, return_fig=False):
    """
    Plots the given profiles as a signal track.
    It should be a T x O x 2 NumPy array, where the subarrays are the
    tracks for the plus and minus strand, for each task. No normalization is
    performed prior to plotting.
    """
    assert len(profs.shape) == 3
    num_tasks, prof_length, _ = profs.shape
    fig, ax = plt.subplots(num_tasks, figsize=(20, num_tasks * 3 * 2))
    if num_tasks == 1:
        ax = [ax]
    for i in range(num_tasks):
        ax[i].plot(profs[i,:,0], color="royalblue")
        ax[i].plot(-profs[i,:,1], color="goldenrod")
    if title:
        fig.suptitle(title)
    if return_fig:
        return fig
    plt.show()

Import SHAP scores, profile predictions, and TF-MoDISco results

In [6]:
# Import SHAP coordinates and one-hot sequences
hyp_scores, _, one_hot_seqs, shap_coords = util.import_shap_scores(shap_scores_path, hyp_score_key, center_cut_size=shap_score_center_size, remove_non_acgt=False)
# This cuts the sequences/scores off just as how TF-MoDISco saw them, but the coordinates are uncut
Importing SHAP scores: 100%|██████████| 12/12 [00:01<00:00,  6.92it/s]
In [7]:
# Import long version of SHAP coordinates and one-hot sequences
hyp_scores_long, _, one_hot_seqs_long, shap_coords_long = util.import_shap_scores(shap_scores_path, hyp_score_key, center_cut_size=None, remove_non_acgt=False)
# This cuts the sequences/scores off just as how TF-MoDISco saw them, but the coordinates are uncut
Importing SHAP scores: 100%|██████████| 12/12 [00:17<00:00,  1.42s/it]
In [8]:
# Subset the long SHAP data to the SHAP data that was used to run TF-MoDISco
shap_coords_table = pd.DataFrame(shap_coords, columns=["chrom", "start", "end"])
shap_coords_long_table = pd.DataFrame(shap_coords_long, columns=["chrom", "start", "end"])

subset_inds = shap_coords_long_table.reset_index().drop_duplicates(["chrom", "start", "end"]).merge(
    shap_coords_table.reset_index(), on=["chrom", "start", "end"]
).sort_values("index_y")["index_x"].values

hyp_scores_long = hyp_scores_long[subset_inds]
one_hot_seqs_long = one_hot_seqs_long[subset_inds]
shap_coords_long = shap_coords_long[subset_inds]

# Make sure the coordinates all match
assert np.all(shap_coords_long == shap_coords)
In [9]:
# Import the set of all profiles and their coordinates
true_profs, pred_profs, all_pred_coords = util.import_profiles(preds_path)

In [10]:
# Subset the predicted profiles/coordinates to the task-specific SHAP coordinates/scores
pred_coords_table = pd.DataFrame(all_pred_coords, columns=["chrom", "start", "end"])

subset_inds = pred_coords_table.reset_index().drop_duplicates(["chrom", "start", "end"]).merge(
    shap_coords_table.reset_index(), on=["chrom", "start", "end"]
).sort_values("index_y")["index_x"].values

true_profs = true_profs[subset_inds]
pred_profs = pred_profs[subset_inds]
pred_coords = all_pred_coords[subset_inds]

# Make sure the coordinates all match
assert np.all(pred_coords == shap_coords)
In [11]:
# Import the TF-MoDISco results object
tfm_obj = util.import_tfmodisco_results(tfm_results_path, hyp_scores, one_hot_seqs, shap_score_center_size)

Show motifs and profiles

For each motif, show:

  1. The motif
  2. Examples of the underlying seqlet
  3. The observed/predicted profiles of that underlying seqlet
In [12]:
num_seqlets_to_show = 3
sizes_to_show = [1000, 400]

metaclusters = tfm_obj.metacluster_idx_to_submetacluster_results
num_metaclusters = len(metaclusters.keys())
for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
    metacluster = metaclusters[metacluster_key]
    display(vdomh.h3("Metacluster %d/%d" % (metacluster_i + 1, num_metaclusters)))
    patterns = metacluster.seqlets_to_patterns_result.patterns
    if not patterns:
        break
    num_patterns = len(patterns)
    for pattern_i, pattern in enumerate(patterns):
        seqlets = pattern.seqlets
        display(vdomh.h4("Pattern %d/%d" % (pattern_i + 1, num_patterns)))
        display(vdomh.p("%d seqlets" % len(seqlets)))
        
        hcwm = pattern["task0_hypothetical_contribs"].fwd
        cwm = pattern["task0_contrib_scores"].fwd
        
        # Trim motif based on total contribution
        score = np.sum(np.abs(cwm), axis=1)
        trim_thresh = np.max(score) * 0.2  # Cut off anything less than 20% of max score
        pass_inds = np.where(score >= trim_thresh)[0]
        trimmed_hcwm = hcwm[max(0, np.min(pass_inds) - 4): min(len(cwm), np.max(pass_inds) + 4 + 1)]
        
        viz_sequence.plot_weights(trimmed_hcwm, figsize=(20, 4), subticks_frequency=(len(trimmed_hcwm) + 1))
        
        # Pick some random seqlets to show
        for seqlet_i in np.random.choice(len(seqlets), size=num_seqlets_to_show, replace=False):
            seqlet = seqlets[seqlet_i]
            coord_index = seqlet.coor.example_idx
            seqlet_start = seqlet.coor.start
            seqlet_end = seqlet.coor.end
            seqlet_rc = seqlet.coor.is_revcomp
            
            hyp = hyp_scores_long[coord_index]
            seq = one_hot_seqs_long[coord_index]
            seqlet_seq = one_hot_seqs[coord_index, seqlet_start:seqlet_end]
            seqlet_hyp = hyp_scores[coord_index, seqlet_start:seqlet_end]
            
            true_prof = true_profs[coord_index]
            pred_prof = pred_profs[coord_index]
            true_prof_fig = plot_profiles(true_prof, return_fig=True)
            pred_prof_fig = plot_profiles(pred_prof, return_fig=True)
            true_prof_fig.tight_layout()
            pred_prof_fig.tight_layout()
            
            table_rows = [
              vdomh.tr(
                    vdomh.td("Observed profiles"),
                    vdomh.td(util.figure_to_vdom_image(true_prof_fig))
                ),
                vdomh.tr(
                    vdomh.td("Predicted profiles"),
                    vdomh.td(util.figure_to_vdom_image(pred_prof_fig))
                )  
            ]
            
            for size in sizes_to_show:
                start = (input_length // 2) - (size // 2)
                end = start + size
                fig = viz_sequence.plot_weights(hyp[start:end] * seq[start:end], subticks_frequency=(size + 1), return_fig=True)
                fig.tight_layout()
                table_rows.append(
                    vdomh.tr(
                        vdomh.td("Importance scores (%d bp)" % size),
                        vdomh.td(util.figure_to_vdom_image(fig))
                    )
                )
            fig = viz_sequence.plot_weights(seqlet_hyp * seqlet_seq, subticks_frequency=(len(seqlet_hyp) + 1), return_fig=True)
            fig.tight_layout()
            table_rows.append(
                vdomh.tr(
                    vdomh.td("Seqlet"),
                    vdomh.td(util.figure_to_vdom_image(fig))
                )
            )
            table = vdomh.table(*table_rows)
            display(table)
            plt.close("all")

Metacluster 1/2

Pattern 1/7

5933 seqlets

Observed profiles
Predicted profiles
Importance scores (1000 bp)
Importance scores (400 bp)
Seqlet
Observed profiles
Predicted profiles
Importance scores (1000 bp)
Importance scores (400 bp)
Seqlet
Observed profiles
Predicted profiles
Importance scores (1000 bp)
Importance scores (400 bp)
Seqlet

Pattern 2/7

2488 seqlets

Observed profiles
Predicted profiles
Importance scores (1000 bp)
Importance scores (400 bp)
Seqlet
Observed profiles
Predicted profiles
Importance scores (1000 bp)
Importance scores (400 bp)
Seqlet
Observed profiles
Predicted profiles
Importance scores (1000 bp)
Importance scores (400 bp)
Seqlet

Pattern 3/7

1198 seqlets

Observed profiles
Predicted profiles
Importance scores (1000 bp)
Importance scores (400 bp)
Seqlet
Observed profiles
Predicted profiles
Importance scores (1000 bp)
Importance scores (400 bp)
Seqlet
Observed profiles
Predicted profiles
Importance scores (1000 bp)
Importance scores (400 bp)
Seqlet

Pattern 4/7

673 seqlets

Observed profiles
Predicted profiles
Importance scores (1000 bp)
Importance scores (400 bp)
Seqlet
Observed profiles
Predicted profiles
Importance scores (1000 bp)
Importance scores (400 bp)
Seqlet
Observed profiles
Predicted profiles
Importance scores (1000 bp)
Importance scores (400 bp)
Seqlet

Pattern 5/7

544 seqlets

Observed profiles
Predicted profiles
Importance scores (1000 bp)
Importance scores (400 bp)
Seqlet
Observed profiles
Predicted profiles
Importance scores (1000 bp)
Importance scores (400 bp)
Seqlet
Observed profiles
Predicted profiles
Importance scores (1000 bp)
Importance scores (400 bp)
Seqlet

Pattern 6/7

474 seqlets

Observed profiles
Predicted profiles
Importance scores (1000 bp)
Importance scores (400 bp)
Seqlet
Observed profiles
Predicted profiles
Importance scores (1000 bp)
Importance scores (400 bp)
Seqlet
Observed profiles
Predicted profiles
Importance scores (1000 bp)
Importance scores (400 bp)
Seqlet

Pattern 7/7

248 seqlets

Observed profiles
Predicted profiles
Importance scores (1000 bp)
Importance scores (400 bp)
Seqlet
Observed profiles
Predicted profiles
Importance scores (1000 bp)
Importance scores (400 bp)
Seqlet
Observed profiles
Predicted profiles
Importance scores (1000 bp)
Importance scores (400 bp)
Seqlet

Metacluster 2/2