In [1]:
import sys
import os
sys.path.append(os.path.abspath("../src/"))
import model.train_profile_model as train_profile_model
import extract.data_loading as data_loading
import keras
import numpy as np
import pandas as pd
import sklearn.decomposition
import collections
import json
import h5py
import umap
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import tqdm
tqdm.tqdm_notebook()
Using TensorFlow backend.
/users/amtseng/miniconda3/envs/tfmodisco/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:516: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
/users/amtseng/miniconda3/envs/tfmodisco/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:517: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
/users/amtseng/miniconda3/envs/tfmodisco/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:518: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
/users/amtseng/miniconda3/envs/tfmodisco/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:519: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
/users/amtseng/miniconda3/envs/tfmodisco/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:520: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
/users/amtseng/miniconda3/envs/tfmodisco/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  np_resource = np.dtype([("resource", np.ubyte, 1)])
/users/amtseng/miniconda3/envs/tfmodisco/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:541: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
/users/amtseng/miniconda3/envs/tfmodisco/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:542: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
/users/amtseng/miniconda3/envs/tfmodisco/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:543: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
/users/amtseng/miniconda3/envs/tfmodisco/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:544: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
/users/amtseng/miniconda3/envs/tfmodisco/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:545: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
/users/amtseng/miniconda3/envs/tfmodisco/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:550: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  np_resource = np.dtype([("resource", np.ubyte, 1)])
/users/amtseng/miniconda3/envs/tfmodisco/lib/python3.7/site-packages/ipykernel_launcher.py:17: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
Out[1]:
0it [00:00, ?it/s]
In [2]:
# Plotting defaults
font_manager.fontManager.ttflist.extend(
    font_manager.createFontList(
        font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts")
    )
)
plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.family": "Roboto",
    "font.weight": "bold"
}
plt.rcParams.update(plot_params)
/users/amtseng/miniconda3/envs/tfmodisco/lib/python3.7/site-packages/ipykernel_launcher.py:4: MatplotlibDeprecationWarning: 
The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead.
  after removing the cwd from sys.path.
In [3]:
# # Constants
# files_spec_path = "/users/amtseng/gata1/data/processed/cutnrun/config/gata1_training_paths.json"
# model_path = "/users/amtseng/gata1/models/trained_models/cutnrun/1/model_ckpt_epoch_9.h5"
# num_tasks = 8
# use_controls = False

# reference_fasta = "/users/amtseng/genomes/hg38.fasta"
# chrom_sizes_tsv = "/users/amtseng/genomes/hg38.canon.chrom.sizes"

# diff_peaks_base = "/users/amtseng/gata1/data/processed/cutnrun/deseq"
# diff_peaks_paths = [
#     os.path.join(diff_peaks_base, name) for name in [
#         "gata1_patient-day6_up-peaks.bed",
#         "gata1_patient-day12_up-peaks.bed",
#         "gata1_wildtype-day6_up-peaks.bed",
#         "gata1_wildtype-day12_up-peaks.bed"
#     ]
# ]
# diff_peaks_names = [
#     "patient_day6_diff", "patient_day12_diff", "wildtype_day6_diff", "wildtype_day12_diff"
# ]

# emb_outfile = "/users/amtseng/gata1/results/embeddings/cutnrun/diff_peaks_embedding_umap2.h5"

# input_length = 1346
# profile_length = 1000

# with open(chrom_sizes_tsv, "r") as f:
#     canon_chroms = [line.split("\t")[0] for line in f]
In [4]:
# Constants
files_spec_path = "/users/amtseng/gata1/data/processed/chipseq/config/gata1_training_paths.json"
model_path = "/users/amtseng/gata1/models/trained_models/chipseq/4/model_ckpt_epoch_7.h5"
num_tasks = 3
use_controls = True

reference_fasta = "/users/amtseng/genomes/mm10.fasta"
chrom_sizes_tsv = "/users/amtseng/genomes/mm10.canon.chrom.sizes"

# diff_peaks_base = "/users/amtseng/gata1/data/processed/chipseq/deseq"
# diff_peaks_paths = [
#     os.path.join(diff_peaks_base, name) for name in [
#         "gata1_patient-307C_up-peaks.bed",
#         "gata1_patient-307H_up-peaks.bed",
#         "gata1_patient-bothmuts_up-peaks.bed",
#         "gata1_wildtype_up-peaks.bed"
#     ]
# ]
# diff_peaks_names = [
#     "patient_307C_diff", "patient_307H_diff", "patient_both_diff", "wildtype_diff"
# ]

diff_peaks_base = "/users/amtseng/gata1/data/processed/chipseq/labels"
diff_peaks_paths = [
    os.path.join(diff_peaks_base, name) for name in [
        "gata1_patient-307C_idr-peaks.bed.gz",
        "gata1_patient-307H_idr-peaks.bed.gz",
        "gata1_wildtype_idr-peaks.bed.gz"
    ]
]
diff_peaks_names = [
    "patient_307C", "patient_307H", "wildtype_diff"
]

emb_outfile = "/users/amtseng/gata1/results/embeddings/chipseq/all_peaks_embedding_umap.h5"

input_length = 1346
profile_length = 1000

with open(chrom_sizes_tsv, "r") as f:
    canon_chroms = [line.split("\t")[0] for line in f]
In [5]:
# Import the model
custom_objects = {
    "kb": keras.backend,
    "profile_loss": train_profile_model.get_profile_loss_function(num_tasks, profile_length),
    "count_loss": train_profile_model.get_count_loss_function(num_tasks)
}
model = keras.models.load_model(model_path, custom_objects=custom_objects)
WARNING:tensorflow:From /users/amtseng/miniconda3/envs/tfmodisco/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:541: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

WARNING:tensorflow:From /users/amtseng/miniconda3/envs/tfmodisco/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4432: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

WARNING:tensorflow:From /users/amtseng/miniconda3/envs/tfmodisco/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:190: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

WARNING:tensorflow:From /users/amtseng/miniconda3/envs/tfmodisco/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:197: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.

WARNING:tensorflow:From /users/amtseng/miniconda3/envs/tfmodisco/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:203: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.

WARNING:tensorflow:From /users/amtseng/miniconda3/envs/tfmodisco/lib/python3.7/site-packages/keras/optimizers.py:793: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

WARNING:tensorflow:From /users/amtseng/gata1/src/model/profile_models.py:253: calling reduce_logsumexp_v1 (from tensorflow.python.ops.math_ops) with keep_dims is deprecated and will be removed in a future version.
Instructions for updating:
keep_dims is deprecated, use keepdims instead
WARNING:tensorflow:From /users/amtseng/miniconda3/envs/tfmodisco/lib/python3.7/site-packages/tensorflow/python/ops/math_ops.py:2403: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
In [6]:
# Function to generate inputs to model
input_func = data_loading.get_input_func(
    files_spec_path, input_length, profile_length, reference_fasta
)
In [7]:
def import_coords_from_bed(peaks_bed, chrom_set=None, simplebed=False):
    if simplebed:
        table = pd.read_csv(
            peaks_bed, sep="\t", header=None,  # Infer compression
            names=[
                "chrom", "peak_start", "peak_end"
            ]
        )
        if chrom_set is not None:
            table = table[table["chrom"].isin(chrom_set)]
        return table[["chrom", "peak_start", "peak_end"]].values
    else:
        table = pd.read_csv(
            peaks_bed, sep="\t", header=None,  # Infer compression
            names=[
                "chrom", "peak_start", "peak_end", "name", "score",
                "strand", "signal", "pval", "qval", "summit_offset"
            ]
        )
        if chrom_set is not None:
            table = table[table["chrom"].isin(chrom_set)]
        table["summit"] = table["peak_start"] + table["summit_offset"]
        vals = table[["chrom", "summit", "summit"]].values
        vals[:, 2] = vals[:, 2] + 1
        return vals
In [8]:
# Get coordinates for each of the tasks
diff_coords = [
    import_coords_from_bed(path, chrom_set=canon_chroms, simplebed=False)
    for path in diff_peaks_paths
]
In [9]:
# Define model whose output is the embedding
embedding_model = keras.models.Model(
    inputs=model.input,
    outputs=keras.layers.Flatten()(model.get_layer("dil_conv_crop").output)
)
embedding_size = np.prod(model.get_layer("dil_conv_crop").output.shape.as_list()[1:])
In [10]:
def get_embeddings(coords, model):
    embeddings = np.empty((len(coords), embedding_size))
    batch_size = 128
    num_batches = int(np.ceil(len(coords) / batch_size))
    for i in tqdm.notebook.trange(num_batches):
        batch_slice = slice(i * batch_size, (i + 1) * batch_size)
        batch = coords[batch_slice]
        input_seqs, profiles = input_func(batch)
        if not use_controls:
            embeddings[batch_slice] = embedding_model.predict(input_seqs)
        else:
            assert profiles.shape[1] == 2 * num_tasks
            embeddings[batch_slice] = embedding_model.predict([input_seqs, profiles[:, num_tasks:]])
    return embeddings
In [11]:
embs = [
    get_embeddings(coords, model) for coords in diff_coords
]



In [12]:
embeddings = np.empty((sum(len(emb) for emb in embs), embedding_size))
labels = np.empty(sum(len(emb) for emb in embs), dtype=int)
i = 0
for l, emb in enumerate(embs):
    embeddings[i : i + len(emb)] = emb
    labels[i : i + len(emb)] = l
    i += len(emb)
In [13]:
centered = embeddings - np.mean(embeddings, axis=0, keepdims=True)
pca = sklearn.decomposition.PCA(n_components=50)
reduced = pca.fit_transform(centered)

um = umap.UMAP(n_neighbors=50, verbose=True)
trans = um.fit_transform(centered)

label_inds = np.unique(labels)
fig, ax = plt.subplots(figsize=(20, 20))
for label_ind in label_inds:
    mask = labels == label_ind
    trans_subset = trans[mask]
    ax.scatter(trans_subset[:, 0], trans_subset[:, 1], label=diff_peaks_names[label_ind], alpha=0.1)
plt.legend()
plt.show()
UMAP(n_neighbors=50, verbose=True)
Construct fuzzy simplicial set
Mon Oct  5 18:11:57 2020 Finding Nearest Neighbors
Mon Oct  5 18:11:57 2020 Building RP forest with 29 trees
Mon Oct  5 20:01:07 2020 NN descent for 18 iterations
	 0  /  18
	 1  /  18
	 2  /  18
	 3  /  18
	 4  /  18
	 5  /  18
	 6  /  18
	 7  /  18
	 8  /  18
	 9  /  18
	 10  /  18
Tue Oct  6 00:17:20 2020 Finished Nearest Neighbor Search
Tue Oct  6 00:21:28 2020 Construct embedding
	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Tue Oct  6 00:30:27 2020 Finished embedding
In [14]:
fig, ax = plt.subplots(figsize=(20, 20))
for label_ind in label_inds:
    mask = labels == label_ind
    trans_subset = trans[mask]
    ax.scatter(trans_subset[:, 0], trans_subset[:, 1], label=diff_peaks_names[label_ind])
plt.legend()
plt.show()
In [15]:
# Save the UMAP results and corresponding coordinates
os.makedirs(os.path.dirname(emb_outfile), exist_ok=True)
with h5py.File(emb_outfile, "w") as f:
    def save_group(group_name, coords, umap_coords):
        group = f.create_group(group_name)
        group.create_dataset("coords_chrom", data=coords[:, 0].astype("S"), compression="gzip")
        group.create_dataset("coords_start", data=coords[:, 1].astype(int), compression="gzip")
        group.create_dataset("coords_end", data=coords[:, 2].astype(int), compression="gzip")
        group.create_dataset("umap_trans", data=umap_coords, compression="gzip")
    for i in range(len(diff_coords)):
        save_group(diff_peaks_names[i], diff_coords[i], trans[labels == i])