import os
import util
import viz_sequence
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import vdom.helpers as vdomh
from IPython.display import display
import tqdm
tqdm.tqdm_notebook()

/users/vir/miniconda2/envs/basepairmodels_latest/lib/python3.7/site-packages/ipykernel_launcher.py:11: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # This is added back by InteractiveShellApp.init_path()

<tqdm.notebook.tqdm_notebook at 0x7fe8a106e1d0>


# Plotting defaults
font_manager.fontManager.ttflist.extend(
    font_manager.createFontList(
        font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts")
    )
)
plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.family": "Roboto",
    "font.weight": "bold"
}
plt.rcParams.update(plot_params)

/users/vir/miniconda2/envs/basepairmodels_latest/lib/python3.7/site-packages/ipykernel_launcher.py:4: MatplotlibDeprecationWarning: 
The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead.
  after removing the cwd from sys.path.


# Define parameters/fetch arguments
preds_path = os.environ["TFM_PRED_PATH"]
shap_scores_path = os.environ["TFM_SHAP_PATH"]
tfm_results_path = os.environ["TFM_TFM_PATH"]

print("Predictions path: %s" % preds_path)
print("DeepSHAP scores path: %s" % shap_scores_path)
print("TF-MoDISco results path: %s" % tfm_results_path)

Predictions path: /mnt/lab_data2/vir/tf_chr_atlas/02-24-2021//predictions/ENCSR000EDZ/profile_predictions.h5
DeepSHAP scores path: /mnt/lab_data2/vir/tf_chr_atlas/02-24-2021//shap/ENCSR000EDZ/profile_scores_alex_format.h5
TF-MoDISco results path: /mnt/lab_data2/vir/tf_chr_atlas/02-24-2021//modisco/ENCSR000EDZ/profile/modisco_results.hd5


# Define constants
input_length, profile_length = 2114, 1000
shap_score_center_size = 400
hyp_score_key = "hyp_scores"
task_index = None


def plot_profiles(profs, title=None, return_fig=False):
    """
    Plots the given profiles as a signal track.
    It should be a T x O x 2 NumPy array, where the subarrays are the
    tracks for the plus and minus strand, for each task. No normalization is
    performed prior to plotting.
    """
    assert len(profs.shape) == 3
    num_tasks, prof_length, _ = profs.shape
    fig, ax = plt.subplots(num_tasks, figsize=(20, num_tasks * 3 * 2))
    if num_tasks == 1:
        ax = [ax]
    for i in range(num_tasks):
        ax[i].plot(profs[i,:,0], color="royalblue")
        ax[i].plot(-profs[i,:,1], color="goldenrod")
    if title:
        fig.suptitle(title)
    if return_fig:
        return fig
    plt.show()


# Import SHAP coordinates and one-hot sequences
hyp_scores, _, one_hot_seqs, shap_coords = util.import_shap_scores(shap_scores_path, hyp_score_key, center_cut_size=shap_score_center_size, remove_non_acgt=False)
# This cuts the sequences/scores off just as how TF-MoDISco saw them, but the coordinates are uncut

Importing SHAP scores:   0%|          | 0/47 [00:00<?, ?it/s]
Importing SHAP scores:   2%|▏         | 1/47 [00:02<02:04,  2.70s/it]
Importing SHAP scores:   4%|▍         | 2/47 [00:04<01:42,  2.27s/it]
Importing SHAP scores:   6%|▋         | 3/47 [00:06<01:23,  1.90s/it]
Importing SHAP scores:   9%|▊         | 4/47 [00:07<01:18,  1.82s/it]
Importing SHAP scores:  11%|█         | 5/47 [00:09<01:10,  1.69s/it]
Importing SHAP scores:  13%|█▎        | 6/47 [00:10<01:02,  1.51s/it]
Importing SHAP scores:  15%|█▍        | 7/47 [00:12<01:01,  1.53s/it]
Importing SHAP scores:  17%|█▋        | 8/47 [00:14<01:07,  1.72s/it]
Importing SHAP scores:  19%|█▉        | 9/47 [00:16<01:08,  1.80s/it]
Importing SHAP scores:  21%|██▏       | 10/47 [00:17<01:02,  1.70s/it]
Importing SHAP scores:  23%|██▎       | 11/47 [00:18<00:56,  1.57s/it]
Importing SHAP scores:  26%|██▌       | 12/47 [00:20<00:52,  1.51s/it]
Importing SHAP scores:  28%|██▊       | 13/47 [00:22<00:58,  1.72s/it]
Importing SHAP scores:  30%|██▉       | 14/47 [00:23<00:50,  1.52s/it]
Importing SHAP scores:  32%|███▏      | 15/47 [00:24<00:44,  1.41s/it]
Importing SHAP scores:  34%|███▍      | 16/47 [00:26<00:49,  1.60s/it]
Importing SHAP scores:  36%|███▌      | 17/47 [00:28<00:51,  1.71s/it]
Importing SHAP scores:  38%|███▊      | 18/47 [00:30<00:49,  1.71s/it]
Importing SHAP scores:  40%|████      | 19/47 [00:32<00:47,  1.71s/it]
Importing SHAP scores:  43%|████▎     | 20/47 [00:33<00:42,  1.59s/it]
Importing SHAP scores:  45%|████▍     | 21/47 [00:35<00:41,  1.61s/it]
Importing SHAP scores:  47%|████▋     | 22/47 [00:36<00:39,  1.60s/it]
Importing SHAP scores:  49%|████▉     | 23/47 [00:38<00:39,  1.66s/it]
Importing SHAP scores:  51%|█████     | 24/47 [00:40<00:38,  1.67s/it]
Importing SHAP scores:  53%|█████▎    | 25/47 [00:42<00:39,  1.80s/it]
Importing SHAP scores:  55%|█████▌    | 26/47 [00:43<00:32,  1.54s/it]
Importing SHAP scores:  57%|█████▋    | 27/47 [00:44<00:28,  1.43s/it]
Importing SHAP scores:  60%|█████▉    | 28/47 [00:45<00:24,  1.28s/it]
Importing SHAP scores:  62%|██████▏   | 29/47 [00:46<00:23,  1.30s/it]
Importing SHAP scores:  64%|██████▍   | 30/47 [00:47<00:21,  1.26s/it]
Importing SHAP scores:  66%|██████▌   | 31/47 [00:49<00:22,  1.42s/it]
Importing SHAP scores:  68%|██████▊   | 32/47 [00:50<00:18,  1.21s/it]
Importing SHAP scores:  70%|███████   | 33/47 [00:51<00:17,  1.23s/it]
Importing SHAP scores:  72%|███████▏  | 34/47 [00:51<00:11,  1.10it/s]
Importing SHAP scores:  74%|███████▍  | 35/47 [00:51<00:08,  1.43it/s]
Importing SHAP scores:  77%|███████▋  | 36/47 [00:52<00:05,  1.86it/s]
Importing SHAP scores:  79%|███████▊  | 37/47 [00:52<00:04,  2.27it/s]
Importing SHAP scores:  81%|████████  | 38/47 [00:52<00:03,  2.81it/s]
Importing SHAP scores:  83%|████████▎ | 39/47 [00:52<00:02,  3.38it/s]
Importing SHAP scores:  85%|████████▌ | 40/47 [00:52<00:01,  3.69it/s]
Importing SHAP scores:  87%|████████▋ | 41/47 [00:52<00:01,  4.22it/s]
Importing SHAP scores:  89%|████████▉ | 42/47 [00:53<00:01,  4.36it/s]
Importing SHAP scores:  91%|█████████▏| 43/47 [00:53<00:00,  4.83it/s]
Importing SHAP scores:  94%|█████████▎| 44/47 [00:53<00:00,  5.16it/s]
Importing SHAP scores:  96%|█████████▌| 45/47 [00:53<00:00,  4.99it/s]
Importing SHAP scores: 100%|██████████| 47/47 [00:53<00:00,  1.15s/it]


# Import long version of SHAP coordinates and one-hot sequences
hyp_scores_long, _, one_hot_seqs_long, shap_coords_long = util.import_shap_scores(shap_scores_path, hyp_score_key, center_cut_size=None, remove_non_acgt=False)
# This cuts the sequences/scores off just as how TF-MoDISco saw them, but the coordinates are uncut

Importing SHAP scores:   0%|          | 0/47 [00:00<?, ?it/s]
Importing SHAP scores:   2%|▏         | 1/47 [00:00<00:35,  1.31it/s]
Importing SHAP scores:   4%|▍         | 2/47 [00:01<00:37,  1.21it/s]
Importing SHAP scores:   6%|▋         | 3/47 [00:03<00:49,  1.12s/it]
Importing SHAP scores:   9%|▊         | 4/47 [00:03<00:41,  1.03it/s]
Importing SHAP scores:  11%|█         | 5/47 [00:04<00:37,  1.13it/s]
Importing SHAP scores:  13%|█▎        | 6/47 [00:05<00:37,  1.09it/s]
Importing SHAP scores:  15%|█▍        | 7/47 [00:06<00:34,  1.17it/s]
Importing SHAP scores:  17%|█▋        | 8/47 [00:07<00:34,  1.12it/s]
Importing SHAP scores:  19%|█▉        | 9/47 [00:07<00:32,  1.19it/s]
Importing SHAP scores:  21%|██▏       | 10/47 [00:08<00:29,  1.24it/s]
Importing SHAP scores:  23%|██▎       | 11/47 [00:09<00:31,  1.16it/s]
Importing SHAP scores:  26%|██▌       | 12/47 [00:10<00:28,  1.21it/s]
Importing SHAP scores:  28%|██▊       | 13/47 [00:11<00:26,  1.26it/s]
Importing SHAP scores:  30%|██▉       | 14/47 [00:12<00:28,  1.17it/s]
Importing SHAP scores:  32%|███▏      | 15/47 [00:12<00:25,  1.23it/s]
Importing SHAP scores:  34%|███▍      | 16/47 [00:13<00:26,  1.16it/s]
Importing SHAP scores:  36%|███▌      | 17/47 [00:14<00:24,  1.22it/s]
Importing SHAP scores:  38%|███▊      | 18/47 [00:15<00:23,  1.26it/s]
Importing SHAP scores:  40%|████      | 19/47 [00:23<01:23,  2.99s/it]
Importing SHAP scores:  43%|████▎     | 20/47 [00:30<01:55,  4.27s/it]
Importing SHAP scores:  45%|████▍     | 21/47 [00:39<02:25,  5.60s/it]
Importing SHAP scores:  47%|████▋     | 22/47 [00:41<01:56,  4.66s/it]
Importing SHAP scores:  49%|████▉     | 23/47 [00:42<01:23,  3.48s/it]
Importing SHAP scores:  51%|█████     | 24/47 [00:43<01:02,  2.73s/it]
Importing SHAP scores:  53%|█████▎    | 25/47 [00:44<00:46,  2.13s/it]
Importing SHAP scores:  55%|█████▌    | 26/47 [00:44<00:35,  1.71s/it]
Importing SHAP scores:  57%|█████▋    | 27/47 [00:46<00:32,  1.61s/it]
Importing SHAP scores:  60%|█████▉    | 28/47 [00:47<00:25,  1.34s/it]
Importing SHAP scores:  62%|██████▏   | 29/47 [00:48<00:22,  1.24s/it]
Importing SHAP scores:  64%|██████▍   | 30/47 [00:48<00:18,  1.08s/it]
Importing SHAP scores:  66%|██████▌   | 31/47 [00:49<00:15,  1.02it/s]
Importing SHAP scores:  68%|██████▊   | 32/47 [00:50<00:14,  1.02it/s]
Importing SHAP scores:  70%|███████   | 33/47 [00:51<00:12,  1.10it/s]
Importing SHAP scores:  72%|███████▏  | 34/47 [00:51<00:11,  1.17it/s]
Importing SHAP scores:  74%|███████▍  | 35/47 [00:52<00:10,  1.11it/s]
Importing SHAP scores:  77%|███████▋  | 36/47 [00:53<00:09,  1.17it/s]
Importing SHAP scores:  79%|███████▊  | 37/47 [00:54<00:08,  1.12it/s]
Importing SHAP scores:  81%|████████  | 38/47 [00:55<00:07,  1.18it/s]
Importing SHAP scores:  83%|████████▎ | 39/47 [00:56<00:06,  1.22it/s]
Importing SHAP scores:  85%|████████▌ | 40/47 [00:57<00:06,  1.11it/s]
Importing SHAP scores:  87%|████████▋ | 41/47 [00:58<00:05,  1.17it/s]
Importing SHAP scores:  89%|████████▉ | 42/47 [00:59<00:04,  1.11it/s]
Importing SHAP scores:  91%|█████████▏| 43/47 [01:06<00:10,  2.75s/it]
Importing SHAP scores:  94%|█████████▎| 44/47 [01:15<00:13,  4.64s/it]
Importing SHAP scores:  96%|█████████▌| 45/47 [01:24<00:12,  6.07s/it]
Importing SHAP scores:  98%|█████████▊| 46/47 [01:28<00:05,  5.52s/it]
Importing SHAP scores: 100%|██████████| 47/47 [01:29<00:00,  1.91s/it]


# Subset the long SHAP data to the SHAP data that was used to run TF-MoDISco
shap_coords_table = pd.DataFrame(shap_coords, columns=["chrom", "start", "end"])
shap_coords_long_table = pd.DataFrame(shap_coords_long, columns=["chrom", "start", "end"])

subset_inds = shap_coords_long_table.reset_index().drop_duplicates(["chrom", "start", "end"]).merge(
    shap_coords_table.reset_index(), on=["chrom", "start", "end"]
).sort_values("index_y")["index_x"].values

hyp_scores_long = hyp_scores_long[subset_inds]
one_hot_seqs_long = one_hot_seqs_long[subset_inds]
shap_coords_long = shap_coords_long[subset_inds]

# Make sure the coordinates all match
assert np.all(shap_coords_long == shap_coords)


# Import the set of all profiles and their coordinates
true_profs, pred_profs, all_pred_coords = util.import_profiles(preds_path)


# Subset the predicted profiles/coordinates to the task-specific SHAP coordinates/scores
pred_coords_table = pd.DataFrame(all_pred_coords, columns=["chrom", "start", "end"])

subset_inds = pred_coords_table.reset_index().drop_duplicates(["chrom", "start", "end"]).merge(
    shap_coords_table.reset_index(), on=["chrom", "start", "end"]
).sort_values("index_y")["index_x"].values

true_profs = true_profs[subset_inds]
pred_profs = pred_profs[subset_inds]
pred_coords = all_pred_coords[subset_inds]

# Make sure the coordinates all match
assert np.all(pred_coords == shap_coords)


# Import the TF-MoDISco results object
tfm_obj = util.import_tfmodisco_results(tfm_results_path, hyp_scores, one_hot_seqs, shap_score_center_size)


num_seqlets_to_show = 3
sizes_to_show = [1000, 400]

metaclusters = tfm_obj.metacluster_idx_to_submetacluster_results
num_metaclusters = len(metaclusters.keys())
for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
    metacluster = metaclusters[metacluster_key]
    display(vdomh.h3("Metacluster %d/%d" % (metacluster_i + 1, num_metaclusters)))
    patterns = metacluster.seqlets_to_patterns_result.patterns
    if not patterns:
        break
    num_patterns = len(patterns)
    for pattern_i, pattern in enumerate(patterns):
        seqlets = pattern.seqlets
        display(vdomh.h4("Pattern %d/%d" % (pattern_i + 1, num_patterns)))
        display(vdomh.p("%d seqlets" % len(seqlets)))
        
        pfm = pattern["sequence"].fwd
        hcwm = pattern["task0_hypothetical_contribs"].fwd
        cwm = pattern["task0_contrib_scores"].fwd
        
        # Trim motif based on information content
        ic = util.info_content(pfm)
        pass_inds = np.where(ic >= 0.2)[0]  # Cut off flanks with less than 0.2 IC
        trimmed_hcwm = hcwm[max(0, np.min(pass_inds) - 4): min(len(pfm), np.max(pass_inds) + 4 + 1)]
        
        viz_sequence.plot_weights(trimmed_hcwm, figsize=(20, 4), subticks_frequency=(len(trimmed_hcwm) + 1))
        
        # Pick some random seqlets to show
        for seqlet_i in np.random.choice(len(seqlets), size=num_seqlets_to_show, replace=False):
            seqlet = seqlets[seqlet_i]
            coord_index = seqlet.coor.example_idx
            seqlet_start = seqlet.coor.start
            seqlet_end = seqlet.coor.end
            seqlet_rc = seqlet.coor.is_revcomp
            
            hyp = hyp_scores_long[coord_index]
            seq = one_hot_seqs_long[coord_index]
            seqlet_seq = one_hot_seqs[coord_index, seqlet_start:seqlet_end]
            seqlet_hyp = hyp_scores[coord_index, seqlet_start:seqlet_end]
            
            true_prof = true_profs[coord_index]
            pred_prof = pred_profs[coord_index]
            true_prof_fig = plot_profiles(true_prof, return_fig=True)
            pred_prof_fig = plot_profiles(pred_prof, return_fig=True)
            true_prof_fig.tight_layout()
            pred_prof_fig.tight_layout()
            
            table_rows = [
              vdomh.tr(
                    vdomh.td("Observed profiles"),
                    vdomh.td(util.figure_to_vdom_image(true_prof_fig))
                ),
                vdomh.tr(
                    vdomh.td("Predicted profiles"),
                    vdomh.td(util.figure_to_vdom_image(pred_prof_fig))
                )  
            ]
            
            for size in sizes_to_show:
                start = (input_length // 2) - (size // 2)
                end = start + size
                fig = viz_sequence.plot_weights(hyp[start:end] * seq[start:end], subticks_frequency=(size + 1), return_fig=True)
                fig.tight_layout()
                table_rows.append(
                    vdomh.tr(
                        vdomh.td("Importance scores (%d bp)" % size),
                        vdomh.td(util.figure_to_vdom_image(fig))
                    )
                )
            fig = viz_sequence.plot_weights(seqlet_hyp * seqlet_seq, subticks_frequency=(len(seqlet_hyp) + 1), return_fig=True)
            fig.tight_layout()
            table_rows.append(
                vdomh.tr(
                    vdomh.td("Seqlet"),
                    vdomh.td(util.figure_to_vdom_image(fig))
                )
            )
            table = vdomh.table(*table_rows)
            display(table)
            plt.close("all")

Link to results¶

Define constants and paths¶

Helper functions¶

Import SHAP scores, profile predictions, and TF-MoDISco results¶

Show motifs and profiles¶

Metacluster 1/2

Pattern 1/7

Pattern 2/7

Pattern 3/7

Pattern 4/7

Pattern 5/7

Pattern 6/7

Pattern 7/7

Metacluster 2/2

Pattern 1/2

Pattern 2/2

Observed profiles
Predicted profiles
Importance scores (1000 bp)
Importance scores (400 bp)
Seqlet