In [1]:
from collections import OrderedDict
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
imp_score = 'profile/wn'

motifs = OrderedDict([
    ("Oct4-Sox2", 'Oct4/m0_p0'),
    # Others
    ("Oct4-Sox2/S", 'Sox2/m0_p0'),
    ("Oct4-Sox2/N", 'Nanog/m0_p0'),
    ("Oct4-Sox2/K", 'Klf4/m0_p1'),
    
    ("Oct4", 'Oct4/m0_p1'),
    ("Oct4-Oct4", 'Oct4/m0_p6'),
    
    ("B-Box", 'Oct4/m0_p5'),
    ("B-Box/S", 'Sox2/m0_p3'),
    ("B-Box/K", 'Klf4/m0_p11'),
    
    ("Sox2", 'Sox2/m0_p1'),
    # Others
    ("Sox2/O", 'Oct4/m0_p3'),
    ("Sox2/N", 'Nanog/m0_p3'),
    ("Sox2/K", 'Klf4/m0_p8'),
    
    ("Nanog", 'Nanog/m0_p1'),
    ("Nanog2", 'Nanog/m0_p4'),
    ("Nanog-mix", 'Nanog/m0_p5'),
    
    # TODO - Other Sox2
    
    ("Zic3", 'Nanog/m0_p2'),
    ("Zic3/K", 'Klf4/m0_p2'),
    
    ("Essrb", 'Oct4/m0_p16'),
    
    ("Klf4", 'Klf4/m0_p0'),
    ("Klf4/O", 'Oct4/m0_p4'),
    ("Klf4/S", 'Sox2/m0_p4'),
    
    ("Klf4-Klf4", 'Klf4/m0_p5'),
])

gpu = 2  # Set to None if GPU shouldn't be used

motifs_inv = {v:k for k,v in motifs.items()}

Goal

  • Visualize the Oct4-distal enhancer region
In [2]:
# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *

from basepair.exp.paper.config import *
from basepair.seqmodel import SeqModel
from basepair.BPNet import model2tasks
from basepair.models import seq_bpnet_cropped_extra_seqlen
from basepair.preproc import resize_interval, parse_interval
from basepair.seqmodel import SeqModel
from basepair.utils import unflatten
from genomelake.extractors import FastaExtractor
from concise.preprocessing.sequence import one_hot2string, DNA
from kipoi.utils import unique_list

import pybedtools
from basepair.utils import flatten_list
paper_config()
Using TensorFlow backend.
In [3]:
if gpu is not None:
    create_tf_session(gpu)
In [4]:
# Common paths
model_dir = models_dir / exp
# figures = f"{ddir}/figures/model-evaluation/chipnexus-bpnet/{exp}/known_enhancer_profiles"
figures = Path(f'{ddir}/figures/modisco/{exp}')

!mkdir -p {figures}/known_enhancer_profiles
In [5]:
# Dataspec
ds = DataSpec.load(rdir / 'src/chipnexus/train/seqmodel/ChIP-nexus.dataspec.yml')
In [6]:
from basepair.modisco.results import MultipleModiscoResult
from basepair.modisco.pattern_instances import (multiple_load_instances, load_instances, filter_nonoverlapping_intervals, 
                                                plot_coocurence_matrix, align_instance_center, dfi2seqlets, annotate_profile)

def shorten_te_pattern(s):
    tf, p = s.split("/", 1)
    return tf + "/" + shorten_pattern(p)

mr = MultipleModiscoResult({t: model_dir / f'deeplift/{t}/out/{imp_score}/modisco.h5'
                           for t in tasks})

centroid_seqlet_matches = {t: pd.read_csv(model_dir / f'deeplift/{t}/out/{imp_score}/centroid_seqlet_matches.csv')
                           for t in tasks}

patterns = [p.trim_seq_ic(0.08) for p in mr.get_all_patterns()
            if shorten_te_pattern(p.name) in list(motifs.values())]
TF-MoDISco is using the TensorFlow backend.
In [7]:
bpnet = SeqModel.from_mdir(model_dir)
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
2019-05-08 00:48:10,548 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
2019-05-08 00:48:24,255 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
In [20]:
for p in patterns:
    p.plot('seq_ic', height=0.4, letter_width=0.15);
    sns.despine(top=True, bottom=True, right=True)
    plt.title(motifs_inv[shorten_te_pattern(p.name)])
    plt.ylim([0,2])

Get predictions and importance scores

In [8]:
from basepair.exp.paper.locus import *
In [22]:
colors_track_only = []
for task in tasks:
    colors_track_only.append((tf_colors[task], tf_colors[task] + "80"))  # 80 add alpha=0.5
In [9]:
# Generate the right colors
colors = []
for task in tasks:
    colors.append((tf_colors[task], tf_colors[task] + "80"))  # 80 add alpha=0.5
    colors.append(None)
In [11]:
interval = parse_interval("chr17:35503550-35504550")
In [ ]:
# actual coordinates
In [1]:
35503550 + 420
Out[1]:
35503970
In [2]:
35503550 + 575
Out[2]:
35504125
In [12]:
viz_dict, seq, imp_scores = interval_predict(bpnet, ds, interval, tasks, incl_pred=True)
xlim = [420, 575]  # Focus only on the 420 - 575 region
viz_dict = filter_tracks(viz_dict, xlim)

# instances
dfim = get_instances(patterns, seq, imp_scores, imp_score, centroid_seqlet_matches, motifs, tasks).query('match_weighted_p > .2')
seqlets = dfi2seqlets(dfim, motifs_inv)
seqlets2 = [s.shift(-xlim[0]) for s in seqlets]
WARNING:tensorflow:From /users/avsec/workspace/basepair/basepair/heads.py:323: calling softmax (from tensorflow.python.ops.nn_ops) with dim is deprecated and will be removed in a future version.
Instructions for updating:
dim is deprecated, use axis instead
2019-05-04 03:24:17,859 [WARNING] From /users/avsec/workspace/basepair/basepair/heads.py:323: calling softmax (from tensorflow.python.ops.nn_ops) with dim is deprecated and will be removed in a future version.
Instructions for updating:
dim is deprecated, use axis instead
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
In [33]:
print(one_hot2string(seq[:, slice(*xlim)], DNA)[0])  # Get the sequence
GGAGGAACTGGGTGTGGGGAGGTTGTAGCCCGACCCTGCCCCTCCCCCCAGGGAGGTTGAGAGTTCTGGGCAGACGGCAGATGCATAACAAAGGTGCATGATAGCTCTGCCCTGGGGGCAGAGAAGATGGTTGGGGAGGGGTCCCTCTCGTCCTA
In [14]:
viz_dict_pred = OrderedDict([(k,v) for k,v in viz_dict.items() if not k.endswith("Obs")])

Predicted and imp scores

In [15]:
fig = plot_tracks(viz_dict_pred,
                  #seqlets=shifted_seqlets,
                  title="{i.chrom}:{i.start}-{i.end}, {i.name}".format(i=interval),
                  fig_height_per_track=0.5,
                  rotate_y=0,
                  fig_width=get_figsize(frac=1)[0],
                  use_spine_subset=True,
                  seqlets=seqlets2,
                  color=colors,
                  ylim=get_ylim(viz_dict_pred, tasks),
                  legend=False)
sns.despine(top=True, right=True, bottom=True)
fig.savefig(f"{figures}/known_enhancer_profiles/all-motifs/distal_oct4.predicted+importance.pdf")

Predicted

In [18]:
viz_dict_pred_only = OrderedDict([(k,v) for k,v in viz_dict.items() if k.endswith("Pred")])
In [21]:
fig = plot_tracks(viz_dict_pred_only,
                  #seqlets=shifted_seqlets,
                  title="{i.chrom}:{i.start}-{i.end}, {i.name}".format(i=interval),
                  fig_height_per_track=0.5,
                  rotate_y=0,
                  fig_width=get_figsize(frac=1)[0],
                  use_spine_subset=True,
                  seqlets=seqlets2,
                  color=colors_track_only,
                  ylim=get_ylim(viz_dict_pred_only, tasks),
                  legend=False)
sns.despine(top=True, right=True, bottom=True)
fig.savefig(f"{figures}/known_enhancer_profiles/all-motifs/distal_oct4.predicted.pdf")

Importance

In [25]:
viz_dict_imp_only = OrderedDict([(k,v) for k,v in viz_dict.items() if k.endswith("Imp profile")])
In [27]:
fig = plot_tracks(viz_dict_imp_only,
                  #seqlets=shifted_seqlets,
                  title="{i.chrom}:{i.start}-{i.end}, {i.name}".format(i=interval),
                  fig_height_per_track=0.5,
                  rotate_y=0,
                  fig_width=get_figsize(frac=1)[0],
                  use_spine_subset=True,
                  # seqlets=seqlets2,
                  color=None,
                  ylim=get_ylim(viz_dict_imp_only, tasks),
                  legend=False)
sns.despine(top=True, right=True, bottom=True)
fig.savefig(f"{figures}/known_enhancer_profiles/all-motifs/distal_oct4.imp.pdf")

Importance + motif instances

In [28]:
fig = plot_tracks(viz_dict_imp_only,
                  #seqlets=shifted_seqlets,
                  title="{i.chrom}:{i.start}-{i.end}, {i.name}".format(i=interval),
                  fig_height_per_track=0.5,
                  rotate_y=0,
                  fig_width=get_figsize(frac=1)[0],
                  use_spine_subset=True,
                  seqlets=seqlets2,
                  color=None,
                  ylim=get_ylim(viz_dict_imp_only, tasks),
                  legend=False)
sns.despine(top=True, right=True, bottom=True)
fig.savefig(f"{figures}/known_enhancer_profiles/all-motifs/distal_oct4.imp+instances.pdf")

Observed + predicted

In [32]:
viz_dict_obs_pred = OrderedDict([(k,v) for k,v in viz_dict.items() if not k.endswith("Imp profile")])
In [31]:
colors_track_only2 = []
for task in tasks:
    colors_track_only2.append((tf_colors[task], tf_colors[task] + "80"))  # 80 add alpha=0.5
    colors_track_only2.append((tf_colors[task], tf_colors[task] + "80"))  # 80 add alpha=0.5
In [39]:
fig = plot_tracks(viz_dict_obs_pred,
                  #seqlets=shifted_seqlets,
                  title="{i.chrom}:{i.start}-{i.end}, {i.name}".format(i=interval),
                  fig_height_per_track=0.5,
                  rotate_y=0,
                  fig_width=get_figsize(frac=.5)[0],
                  use_spine_subset=True,
                  # seqlets=seqlets2,
                  color=colors_track_only2,
                  ylim=get_ylim(viz_dict_obs_pred, tasks),
                  legend=False)
sns.despine(top=True, right=True, bottom=True)
fig.savefig(f"{figures}/known_enhancer_profiles/all-motifs/distal_oct4.observed+pred.pdf")
In [43]:
ls {figures}/known_enhancer_profiles/all-motifs/
distal_oct4.imp+instances.pdf
distal_oct4.imp.pdf
distal_oct4.observed+pred.pdf
distal_oct4.predicted+importance.pdf
distal_oct4.predicted.pdf
distal_oct4.pred,imp,imp-counts.pdf
Dpp5a uptream,chr9:78,368,500-78,370,278.150bp.pdf
Dpp5a uptream,chr9:78,368,500-78,370,278.1kb.pdf
dppa3 downstream,chr6:122668671-122668700.150bp.pdf
dppa3 downstream,chr6:122668671-122668700.1kb.pdf
Dppa3 upstream,chr6:122623982-122629482.150bp.pdf
Dppa3 upstream,chr6:122623982-122629482.1kb.pdf
Esrrb intragenic 2,chr12:86498650-86505040.150bp.pdf
Esrrb intragenic 2,chr12:86498650-86505040.1kb.pdf
Esrrb intragenic --> 5 smaller regions,chr12:86470750-86505250.150bp.pdf
Esrrb intragenic --> 5 smaller regions,chr12:86470750-86505250.1kb.pdf
Esrrb intragenic,chr12:86470850-86479400.150bp.pdf
Esrrb intragenic,chr12:86470850-86479400.1kb.pdf
Fbxo15,chr18:84933726-84935053.150bp.pdf
Fbxo15,chr18:84933726-84935053.1kb.pdf
Fgf5 POISED enhancer (in ESCs), intragenic,chr5:98283821-98284016.150bp.pdf
Fgf5 POISED enhancer (in ESCs), intragenic,chr5:98283821-98284016.1kb.pdf
Klf2 E1 upstream enhancer,chr8:72311216-72311616.1kb.pdf
Klf2 E1 upstream enhancer (part of superenhancer),chr8:72311184-72311577.150bp.pdf
Klf2 E1 upstream enhancer (part of superenhancer),chr8:72311184-72311577.1kb.pdf
Klf2 E2 upstream enhancer (part of superenhancer),chr8:72316145-72316545.150bp.pdf
Klf2 E2 upstream enhancer (part of superenhancer),chr8:72316145-72316545.1kb.pdf
Klf2 upstream enhancer,chr8:72316001-72316501.150bp.pdf
Klf2 upstream enhancer,chr8:72316001-72316501.1kb.pdf
Klf4 E2 upstream enhancer,chr4:55475488-55475688.1kb.pdf
Klf upstream enhancer,chr4:55474700-55478700.150bp.pdf
Klf upstream enhancer,chr4:55474700-55478700.1kb.pdf
Klf upstream enhancer E1,chr4:55477752-55477893.150bp.pdf
Klf upstream enhancer E1,chr4:55477752-55477893.1kb.pdf
Klf upstream enhancer E2,chr4:55475427-55475489.150bp.pdf
Klf upstream enhancer E2,chr4:55475427-55475489.1kb.pdf
Klf upstream enhancer E3,chr4:55464224-55464356.150bp.pdf
Klf upstream enhancer E3,chr4:55464224-55464356.1kb.pdf
Lefty1 upstream,chr1:180924752-180925152.1kb.pdf
Nanog upstream --> 2 regions,chr6:122702282-122707682.150bp.pdf
Nanog upstream --> 2 regions,chr6:122702282-122707682.1kb.pdf
Nanog upstream,chr6:122701982-122707982.150bp.pdf
Nanog upstream,chr6:122701982-122707982.1kb.pdf
nanog upstream,chr6:122702883-122702904.150bp.pdf
nanog upstream,chr6:122702883-122702904.1kb.pdf
Nr0b1 (Dax1) upstream and intragenic --> 2 regions,chrX:86187061-86193811.150bp.pdf
Nr0b1 (Dax1) upstream and intragenic --> 2 regions,chrX:86187061-86193811.1kb.pdf
Oct4 distal enhancer,chr17:35503913-35504118.150bp.pdf
Oct4 distal enhancer,chr17:35503913-35504118.1kb.pdf
Oct4 distal enhancer,chr17:35504453-35504603.1kb.pdf
Oct4 proximal enhancer,chr17:35504982-35505186.150bp.pdf
Oct4 proximal enhancer,chr17:35504982-35505186.1kb.pdf
Pou5f1 E1 upstream enhancer (part of superenhancer),chr17:35502527-35502964.150bp.pdf
Pou5f1 E1 upstream enhancer (part of superenhancer),chr17:35502527-35502964.1kb.pdf
Pou5f1 E2 upstream enhancer (part of superenhancer),chr17:35503827-35504406.150bp.pdf
Pou5f1 E2 upstream enhancer (part of superenhancer),chr17:35503827-35504406.1kb.pdf
Pou5f1 E3 upstream enhancer (part of superenhancer),chr17:35504846-35505276.150bp.pdf
Pou5f1 E3 upstream enhancer (part of superenhancer),chr17:35504846-35505276.1kb.pdf
Prdm14 E2 upstream enhancer (part of superenhancer),chr1:13074519-13074931.150bp.pdf
Prdm14 E2 upstream enhancer (part of superenhancer),chr1:13074519-13074931.1kb.pdf
Prdm14 E3 upstream enhancer,chr1:13085325-13085725.1kb.pdf
Prdm14 E3 upstream enhancer (part of superenhancer),chr1:13084919-13085299.150bp.pdf
Prdm14 E3 upstream enhancer (part of superenhancer),chr1:13084919-13085299.1kb.pdf
reporter-validated regions 10,chr10:84911039-84911106.150bp.pdf
reporter-validated regions 10,chr10:84911039-84911106.1kb.pdf
reporter-validated regions 11,chr14:77250873-77250920.150bp.pdf
reporter-validated regions 11,chr14:77250873-77250920.1kb.pdf
reporter-validated regions 12,chr5:22573072-22573110.150bp.pdf
reporter-validated regions 12,chr5:22573072-22573110.1kb.pdf
reporter-validated regions 1,chr7:13014354-13014635.150bp.pdf
reporter-validated regions 1,chr7:13014354-13014635.1kb.pdf
reporter-validated regions 2,chr9:49751562-49751768.150bp.pdf
reporter-validated regions 2,chr9:49751562-49751768.1kb.pdf
reporter-validated regions 3,chr8:48752015-48752193.150bp.pdf
reporter-validated regions 3,chr8:48752015-48752193.1kb.pdf
reporter-validated regions 4,chr19:28182843-28183013.150bp.pdf
reporter-validated regions 4,chr19:28182843-28183013.1kb.pdf
reporter-validated regions 5,chr5:123158158-123158309.150bp.pdf
reporter-validated regions 5,chr5:123158158-123158309.1kb.pdf
reporter-validated regions 6,chr4:10932386-10932532.150bp.pdf
reporter-validated regions 6,chr4:10932386-10932532.1kb.pdf
reporter-validated regions 7,chr4:57690950-57691052.150bp.pdf
reporter-validated regions 7,chr4:57690950-57691052.1kb.pdf
reporter-validated regions 8,chr8:89003664-89003762.150bp.pdf
reporter-validated regions 8,chr8:89003664-89003762.1kb.pdf
reporter-validated regions 9,chr18:30121791-30121877.150bp.pdf
reporter-validated regions 9,chr18:30121791-30121877.1kb.pdf
Sall1_as in fig 1,chr8:89,003,534-89,003,870.150bp.pdf
Sall1_as in fig 1,chr8:89,003,534-89,003,870.1kb.pdf
sall1 downstream_1,chr8:89,002,819-89,004,759.150bp.pdf
sall1 downstream_1,chr8:89,002,819-89,004,759.1kb.pdf
sall1 downstream_2,chr8:89,014,827-89,016,499.150bp.pdf
sall1 downstream_2,chr8:89,014,827-89,016,499.1kb.pdf
Sall1 intragenic,chr8:9151000-9156000.150bp.pdf
Sall1 intragenic,chr8:9151000-9156000.1kb.pdf
sall4,chr2:168846556-168867035.150bp.pdf
sall4,chr2:168846556-168867035.1kb.pdf
sequences.150bp.fa
sequences.150bp.new.fa
Sik1 E1 upstream enhancer (part of superenhancer),chr17:31802652-31803103.150bp.pdf
Sik1 E1 upstream enhancer (part of superenhancer),chr17:31802652-31803103.1kb.pdf
Sik1 E2 upstream enhancer (part of superenhancer),chr17:31806454-31806921.150bp.pdf
Sik1 E2 upstream enhancer (part of superenhancer),chr17:31806454-31806921.1kb.pdf
Sik1 E3 upstream enhancer (part of superenhancer),chr17:31808187-31808528.150bp.pdf
Sik1 E3 upstream enhancer (part of superenhancer),chr17:31808187-31808528.1kb.pdf
Sik1 E6 upstream enhancer (part of superenhancer),chr17:31819327-31819731.150bp.pdf
Sik1 E6 upstream enhancer (part of superenhancer),chr17:31819327-31819731.1kb.pdf
Sox2 downstream enhancer (SCR),chr3:34649961-34652521.150bp.pdf
Sox2 downstream enhancer (SCR),chr3:34649961-34652521.1kb.pdf
Sox2 SCR-SRR107 (downstream enhancer),chr3:34757649-34759185.150bp.pdf
Sox2 SCR-SRR107 (downstream enhancer),chr3:34757649-34759185.1kb.pdf
Sox2 SCR-SRR111 (downstream enhancer),chr3:34761023-34762535.150bp.pdf
Sox2 SCR-SRR111 (downstream enhancer),chr3:34761023-34762535.1kb.pdf
Tbx3 intragenic 1,chr5:119679831-119679921.150bp.pdf
Tbx3 intragenic 1,chr5:119679831-119679921.1kb.pdf
Tbx3 intragenic 2,chr5:119683200-119683427.150bp.pdf
Tbx3 intragenic 2,chr5:119683200-119683427.1kb.pdf
Tbx3 intragenic 3,chr5:119679791-119682191.150bp.pdf
Tbx3 intragenic 3,chr5:119679791-119682191.1kb.pdf
Zfp281 downstream enhancer,chr1:136680205-136680605.1kb.pdf
In [41]:
!mkdir -p {figures}/known_enhancer_profiles/all-motifs

Observed and predicted

In [ ]:
from basepair.plot.tracks import *
In [ ]:
def plot_seqlet_underscore(seqlet, ax, add_label=False):
    """
    Args:
      seqlet: object with start, end, name, strand attribues
      if Seqname is available, then we can plot it to the right position
    """
    xlim = ax.get_xlim()

    xmin = seqlet.start + 0.5
    xmax = seqlet.end + 0.5
    if xmax < 0 or xmin > xlim[1] - xlim[0]:
        return

    # TODO - it would be nice to also have some differnet colors here
    draw_hline(xmin, xmax, ax.get_ylim()[0], col='r', linewidth=5, alpha=0.3)

    if add_label:
        y = ax.get_ylim()[1] + (ax.get_ylim()[1] - ax.get_ylim()[0]) * 0.15

        ax.text(xmin + 0.5, y,
                s=seqlet.strand + str(seqlet.name),
                fontsize=5)

Other enhancers

In [39]:
df_enhancers = pd.read_csv("https://docs.google.com/spreadsheets/d/1nIRLv3tWq_3BjorP_pEAyJKWtVClX6_OrE2waN94ECc/export?gid=0&format=csv")
In [40]:
# # New regions to generate
# names = ['sall1 downstream_1',
#         'sall1 downstream_2',
#         'Sall1_as in fig 1',
#         'Dpp5a uptream',
#         'Fbxo15']

# df_enhancers = df_enhancers[df_enhancers.Name.isin(names)]
In [41]:
df_enhancer_intervals = df_enhancers[['Name', 'mm10 coordinates']].dropna()
In [42]:
# assert len(names) == len(df_enhancer_intervals)
In [43]:
intervals = [(row.Name, str(row[['mm10 coordinates']].iloc[0]).strip())
             for i, row in df_enhancer_intervals.iterrows()]
In [ ]:
sequences = dict()
for i, (name, interval_str) in enumerate(intervals):
    print(f"{i}/{len(intervals)}", name)
    interval = parse_interval(interval_str)
    viz_dict, seq, imp_scores = interval_predict(bpnet, ds, interval, tasks)
    dfim = get_instances(patterns, seq, imp_scores, imp_score, centroid_seqlet_matches, motifs, tasks).query('match_weighted_p > .2')
    seqlets = dfi2seqlets(dfim, motifs_inv)
    
    xlim = None
    fig = plot_tracks(viz_dict,
                      #seqlets=shifted_seqlets,
                      title=str_interval(interval, xlim) + name,
                      fig_height_per_track=0.5,
                      rotate_y=0,
                      fig_width=get_figsize(frac=2)[0],
                      use_spine_subset=True,
                      seqlets=seqlets,
                      color=colors,
                      ylim=get_ylim(viz_dict, tasks, True),
                      # seqlet_plot_fn=plot_seqlet_underscore,
                      legend=False)
    sns.despine(top=True, right=True, bottom=True)
    fig.savefig(f"{figures}/known_enhancer_profiles/all-motifs/{name},{interval_str}.1kb.pdf")
    plt.close()
    
    # Figure out the most interesting 150 bp in the entire 1kb region
    # by looking at the total number of counts in the 150 bp window
    from basepair.preproc import moving_average
    # center = np.argmax(moving_average(sum([np.abs(viz_dict[f'{task} Obs']).sum(axis=-1) for task in tasks]), 150))
    center = np.argmax(moving_average(sum([np.abs(viz_dict[f'{task} Imp profile']).sum(axis=-1) for task in tasks]), 150))
    xlim = [center - 75, center + 75]
    seqlets2 = [s.shift(-xlim[0]) for s in seqlets]
    viz_dict2 = filter_tracks(viz_dict, xlim)

    print(str_interval(interval, xlim), name)
    seq_str = one_hot2string(seq[:, slice(*xlim)], DNA)[0]
    print(seq_str)  # Get the sequence
    sequences[f"{name},{interval_str}"] = seq_str
    fig = plot_tracks(viz_dict2,
                      #seqlets=shifted_seqlets,
                      title=str_interval(interval, xlim) + " (" + str(xlim) + ")",
                      fig_height_per_track=0.5,
                      rotate_y=0,
                      fig_width=get_figsize(frac=1)[0],
                      use_spine_subset=True,
                      seqlets=seqlets2,
                      color=colors,
                      ylim=get_ylim(viz_dict2, tasks, True),
                      legend=False)
    sns.despine(top=True, right=True, bottom=True)
    fig.savefig(f"{figures}/known_enhancer_profiles/all-motifs/{name},{interval_str}.150bp.pdf")
    plt.close()
    
# Write out the fasta file
from concise.utils.fasta import write_fasta
write_fasta(f"{figures}/known_enhancer_profiles/all-motifs/sequences.150bp.new.fa", list(sequences.values()), list(sequences))
0/53 Klf upstream enhancer
resizing the interval of length 4000 to 1000
chr4:55475174-55475324, . Klf upstream enhancer
GGCTATCCATTCCTCTATCAACTCATCCATTCATCCATCCATAAATCAATCCTCATTCACTTGCCTATTTCTCCACCCAGTCACTGATTCATCTATGCATCTATCCATTTATCCACCCATCACTCACCCACCCATCTACCCACTCATTTA
1/53 Klf upstream enhancer E1
resizing the interval of length 141 to 1000
chr4:55477925-55478075, . Klf upstream enhancer E1
GGCCCCACTCCCACATCCTATCATACACATTGAAATTCACCCACTTTGCATATCAAATGAGTTATATATAGCTAACTGGGGAGGCCAGTTGCAAAGACAGTTGACATAATGTTACCTTTTGTAGACATTTAATTACACACTCATCAATTT
2/53 Klf upstream enhancer E2
resizing the interval of length 62 to 1000
chr4:55475915-55476065, . Klf upstream enhancer E2
GGTCATCCTTGCCTGTTCAATCCTGAGCAACCTTCCTGGCAAAGGATGGACTATAACCTGCTAACCTTGAATACATTATTATAAGGGAAAGCAGCTGACCAACAAAGTGTGGGGGGTTTCTCCATTTATATGAAGATCAGAGTCGTTAAT
3/53 Klf upstream enhancer E3
resizing the interval of length 132 to 1000
chr4:55464818-55464968, . Klf upstream enhancer E3
TGAGCACTTGATTCTTAGTTCACTTTCTCAGCTTGTTCCCTATTCTTCCGTACCCTTTAAAGTCCTCCCAGCTTTTGAATACTAATCACATTCACTAGTCCAAAGTCCTTTGTCATTTTACGCGCTCCCTAGAATAGTCAATTGCATAAA
4/53 Sox2 downstream enhancer (SCR)
resizing the interval of length 2560 to 1000
chr3:34650425-34650575, . Sox2 downstream enhancer (SCR)
GGGCGCCCTGCCAGGCCGGGGACCTCCGGGACATGATCAGCATGTACCTCCCCGGCGCCGAGGTGCCGGAGCCCGCTGCGCCCAGTAGACTGCACATGGCCCAGCACTACCAGAGCGGCCCGGTGCCCGGCACGGCCATTAACGGCACAC
5/53 Sox2 SCR-SRR107 (downstream enhancer)
resizing the interval of length 1536 to 1000
chr3:34757720-34757870, . Sox2 SCR-SRR107 (downstream enhancer)
AAGGGACTCAAATGACACCTCCCTCCCGGCTGTTACCCAGGAAGTTGTTTAGCATAGTCCCAGGACTCTGCTAAGGGCAAGTACTGTCATTGTTATTTATTTAGCCCCATTATGCTAACTACCCACCCCTTGCCAAGGTTTCACCACCCA
6/53 Sox2 SCR-SRR111 (downstream enhancer)
resizing the interval of length 1512 to 1000
chr3:34761145-34761295, . Sox2 SCR-SRR111 (downstream enhancer)
AATAGAAGAATTTAAGAATGACTCAAATGGAAGGTGGAGGACAATTAGGGTTTAAAAAAAGAACCTGGGATGGGCCAGTTGTAAACCCCCTGGAGCTGCCTAGAGGAAGGAGCTGGAGGAGAGCTTAGAAAACAAAGGGGGAGGTCATGG
7/53 Tbx3 intragenic 1
resizing the interval of length 90 to 1000
chr5:119680487-119680637, . Tbx3 intragenic 1
TAGAATTATTACAATAAGAAAAGCTGCCAGTTGGGGGCCTGGCTGGAGAAGGTTGAACACTTGTGATTCCTTCTGATAGTGTAAAGTGTATCAACATCACCATTATCATCGTCCCATTTTACAGACTGGGAAAATGAAGTCTGGGAGTGG
8/53 Fgf5 POISED enhancer (in ESCs), intragenic
resizing the interval of length 195 to 1000
chr5:98284444-98284594, . Fgf5 POISED enhancer (in ESCs), intragenic
CCCCCTCCCTGGGGGTGTGCTCAGAGGACCACCCTGCAAGTAGCTGGCTCTGTGGTCACAGTTTCATGCCGTGGGCGGTGAGTGGGTGTGGAGCCGGAGGCAGGATGTGCCGGCAGGATCCAATGCGGTTAAAGTTTGGGATCGCTAGTC
9/53 Tbx3 intragenic 2
resizing the interval of length 227 to 1000
chr5:119683583-119683733, . Tbx3 intragenic 2
ACATGGGGTGACCCCCGCCCTGTCCCTTTCAGTTTTGTCTGGGAGGGAGCACTAAAGCCTTTCCAGAGACTATGCTAGATACCACAGGCCCTGCCACCATGGGCTGTGTAACCAGGCTGCTGTTGCTTTGAAACGCGGGACTGAGGGGGC
10/53 Klf2 upstream enhancer
resizing the interval of length 500 to 1000
chr8:72316417-72316567, . Klf2 upstream enhancer
CAAAGCCAACCAGAAGCAAACAAAACAACCCATTTGAAACTATTCAAAAGACTGAGGCCTGCACAAAGGGCTTAGAGGAGCTAAAAGAAAAGATATGCAGAAAGAGAAAAAACGCATTTCAAAAGCTCCCCCTGGCCATAGGTGTGGTGC
11/53 Esrrb intragenic
resizing the interval of length 8550 to 1000
chr12:86471225-86471375, . Esrrb intragenic
GATGGGGGTCTGATTGAATTTGCTAGCATATCTAAAGGAGGGTTTGTGACTGTCCAGGACGATTAGCCTGCCCTTGGAGCAGAGAGACTGGCAAAGGTGCCCTCCTGCCCAGGGAAAGGGAAGCTGAAGCTAGGGGCAGCCTGGGTGTGA
12/53 Esrrb intragenic 2
resizing the interval of length 6390 to 1000
chr12:86499056-86499206, . Esrrb intragenic 2
CCAAGGGCTGGCATTACCGGCTGGTATCACCTGATTTACGAGGTTGCTTTCTTTTGCTGGTGGTATTCAACTGCAAAGTTGAGCTATCAAGTCATTGGCAAAGAGGACAAAGCCTTTGGATTTGGAGGTGGCCCACGAATTCCACAGATC
13/53 Nanog upstream --> 2 regions
resizing the interval of length 5400 to 1000
chr6:122703100-122703250, . Nanog upstream --> 2 regions
GGGAGGAGGGCAAGATGCCGCGCTCTGCGTTTCTCCAGCTGGATGTGGACAGAGGTCAACCAGCCACATTAGTTTATGTCTAAAAGTGTCTAATTGAAACAAGAAATGGCTGCTTTAGCCGGGTGTGGTGGCACACGCCTTTAATCCCAG
14/53 Nr0b1 (Dax1) upstream and intragenic --> 2 regions
resizing the interval of length 6750 to 1000

Specific scale

Other enhancers

In [1]:
# intervals = [
#     ("Klf2 E1 upstream enhancer", "chr8:72311216-72311616"),
#     ("Klf4 E2 upstream enhancer", "chr4:55475488-55475688"),
#     ("Prdm14 E3 upstream enhancer", "chr1:13084919-13085299"),
#     ("Zfp281 downstream enhancer", "chr1:136680205-136680605"),
#     ("Lefty1 upstream", "chr1:180924752-180925152"),
#     ("Oct4 distal enhancer", "chr17:35504453-35504603")
# ]
In [10]:
df_enhancers = pd.read_csv("https://docs.google.com/spreadsheets/d/1nIRLv3tWq_3BjorP_pEAyJKWtVClX6_OrE2waN94ECc/export?gid=0&format=csv")
df_enhancer_intervals = df_enhancers[['Name', 'mm10 coordinates']].dropna()
intervals = [(row.Name, str(row[['mm10 coordinates']].iloc[0]).strip())
             for i, row in df_enhancer_intervals.iterrows()]
In [11]:
intervals = [
    ('Nanog upstream enhancer', 'chr6:122707295-122,707,721'),
    ('Fbxo15 enhancer', 'chr18:84934293-84934692'),
    ('Nr0b1 enhancer', 'chrX:86187475-86187504')
]
In [10]:
intervals = [('Nanog_EMSA_1', 'chr13:3712945-3712985'),
            ('Nanog_EMSA_2', 'chr19:21785406-21785446'),
            ('Nanog_EMSA_3', 'chr4:40856724-40856764'),
            ('Nanog_EMSA_4', 'chr6:112885434-112885474'),
            ('Nanog_EMSA_5', 'chr5:142415570-142415610'),
            ('Tbx3_distal_1', 'chr5:119579664-119580351'),
            ('Tbx3_distal_2', 'chr5:119579408-119579566'),
            ('Tbx3_distal_3', 'chr5:119584867-119585462'),
            ('dsp_distal', 'chr13:38109043-38109501'),
            ('dsp_proximal', 'chr13:38123730-38124166'),
            ('cdh1_inragenic', 'chr8:106609099-106609356')
 ]
In [11]:
!mkdir -p {figures}/known_enhancer_profiles/all-motifs-individual-y-scale-profile
In [12]:
# Generate the right colors
colors = []
for task in tasks:
    colors.append((tf_colors[task], tf_colors[task] + "80"))  # 80 add alpha=0.5
    colors.append(None)
In [13]:
sequences = dict()
for i, (name, interval_str) in enumerate(intervals):
    print(f"{i}/{len(intervals)}", name)
    interval = parse_interval(interval_str)
    interval = resize_interval(interval, 1000)
    viz_dict, seq, imp_scores = interval_predict(bpnet, ds, interval, tasks)
    dfim = get_instances(patterns, seq, imp_scores, imp_score, centroid_seqlet_matches, motifs, tasks).query('match_weighted_p > .2')
    seqlets = dfi2seqlets(dfim, motifs_inv)
    
    xlim = None
    fig = plot_tracks(viz_dict,
                      #seqlets=shifted_seqlets,
                      title=str_interval(interval, xlim) + name,
                      fig_height_per_track=0.5,
                      rotate_y=0,
                      fig_width=get_figsize(frac=2)[0],
                      use_spine_subset=True,
                      seqlets=seqlets,
                      color=colors,
                      ylim=get_ylim(viz_dict, tasks, True),
                      # seqlet_plot_fn=plot_seqlet_underscore,
                      legend=False)
    sns.despine(top=True, right=True, bottom=True)
    fig.savefig(f"{figures}/known_enhancer_profiles/all-motifs-individual-y-scale-profile/{name},{interval_str}.1kb.pdf")
    plt.close()
    
    # Figure out the most interesting 150 bp in the entire 1kb region
    # by looking at the total number of counts in the 150 bp window
    from basepair.preproc import moving_average
    # center = np.argmax(moving_average(sum([np.abs(viz_dict[f'{task} Obs']).sum(axis=-1) for task in tasks]), 150))
    center = np.argmax(moving_average(sum([np.abs(viz_dict[f'{task} Imp profile']).sum(axis=-1) for task in tasks]), 150))
    center = min(max(center, 75), 925)
    xlim = [center - 75, center + 75]
    seqlets2 = [s.shift(-xlim[0]) for s in seqlets]
    viz_dict2 = filter_tracks(viz_dict, xlim)

    print(str_interval(interval, xlim), name)
    seq_str = one_hot2string(seq[:, slice(*xlim)], DNA)[0]
    print(seq_str)  # Get the sequence
    sequences[f"{name},{interval_str}"] = seq_str
    fig = plot_tracks(viz_dict2,
                      #seqlets=shifted_seqlets,
                      title=str_interval(interval, xlim) + " (" + str(xlim) + ")",
                      fig_height_per_track=0.5,
                      rotate_y=0,
                      fig_width=get_figsize(frac=1)[0],
                      use_spine_subset=True,
                      seqlets=seqlets2,
                      color=colors,
                      ylim=get_ylim(viz_dict2, tasks, True),
                      legend=False)
    sns.despine(top=True, right=True, bottom=True)
    fig.savefig(f"{figures}/known_enhancer_profiles/all-motifs-individual-y-scale-profile/{name},{interval_str}.150bp.pdf")
    plt.close()
    
# Write out the fasta file
from concise.utils.fasta import write_fasta
write_fasta(f"{figures}/known_enhancer_profiles/all-motifs-individual-y-scale-profile/sequences.150bp.new.fa", list(sequences.values()), list(sequences))    
0/11 Nanog_EMSA_1
WARNING:tensorflow:From /users/avsec/workspace/basepair/basepair/heads.py:323: calling softmax (from tensorflow.python.ops.nn_ops) with dim is deprecated and will be removed in a future version.
Instructions for updating:
dim is deprecated, use axis instead
2019-05-08 00:51:48,480 [WARNING] From /users/avsec/workspace/basepair/basepair/heads.py:323: calling softmax (from tensorflow.python.ops.nn_ops) with dim is deprecated and will be removed in a future version.
Instructions for updating:
dim is deprecated, use axis instead
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
chr13:3712892-3713042, . Nanog_EMSA_1
AATCACTGTGGGAGGCCAACTCAGAGTAAGTGAGTAGGCTGGGAAAGCCATTTTCCTGCAACCAGCCCTTGATGGCCCTCCTTGATGGCCCGCTGAGTCTGTGCCTGGAAAGCTGACTTTAGATCCCTGGAGAGCTCCTTGATGTGCAGA
1/11 Nanog_EMSA_2
chr19:21785392-21785542, . Nanog_EMSA_2
TTCCTAACTCTGATGGATTCCTTTCAGCTCTGATGGGTTTCTTTCAGCTATTGAGGACTTTAGCGTCTGCCACCCACTGCTTCTTGTGGACTTGGTTAGCAGAGTTGAAAGGGAAGCACTCTTCAGAATATTTACTGCCAGCCATATGCC
2/11 Nanog_EMSA_3
chr4:40856682-40856832, . Nanog_EMSA_3
GGGCGGGGGCGGGGGGGGGGAATAACACCTGCTGATTGTCTGTTGATTGGGATGACAAAGAACCCATCAAGGACTAAGCCTTTTTTATGCCTGCCAGTTTACATTGCTGTCACACCCCAGTAGATGAGGAGAACAGAGGACAGTACCTGG
3/11 Nanog_EMSA_4
chr6:112885405-112885555, . Nanog_EMSA_4
CAGCACATTTTCCTCCCTTTGATGGATGATCACTTAATTCCTCCTTGATTGCTTTTCAAAAGCAATGTAAATTAATGCAAGCCCACACTGGGAGCCCACAAAGGGCAAGGCTCTCTGTGAATCATTAGTGCATGATAATAAATGCAAATA
4/11 Nanog_EMSA_5
chr5:142415560-142415710, . Nanog_EMSA_5
TATTGCAGAGCAAACCATCCGGACTAATGGGCCATCAGGGATAAATCACAGGACCAAGGATGCGATTTACATAACGGTTAAACCAGTTTAGTGTATCCCGAGAAGGGAGCCAGCTCCTCCCCAAGCGATTATTCAGCGTTATTAATCTGC
5/11 Tbx3_distal_1
chr5:119579752-119579902, . Tbx3_distal_1
AAGTGCAAAGCCCAGTTGTTACAAAGAGTACTTGATGGTAGCAACACTGGTTGGTTAAAAGCCATCAAGCCATAAATTGAGCCATCTTGAGACAGACTGGGTTCAAAATCTTCAGTTGCTATTGTTAGAAATTTCAGTCAGGTTGCCACT
6/11 Tbx3_distal_2
chr5:119579737-119579887, . Tbx3_distal_2
CCAGCCTGGGCTATAAAGTGCAAAGCCCAGTTGTTACAAAGAGTACTTGATGGTAGCAACACTGGTTGGTTAAAAGCCATCAAGCCATAAATTGAGCCATCTTGAGACAGACTGGGTTCAAAATCTTCAGTTGCTATTGTTAGAAATTTC
7/11 Tbx3_distal_3
chr5:119585138-119585288, . Tbx3_distal_3
ATCCTGGAATTTGTTGTTTAAAATAAGAAAAGAAAGTAGATTGGAGTTTCCCGGACTTAACTCTTTTTGGAGAAATCACTTTATCTTTTTTAATTGTCCCGTTTTTATCTCCACCCCTCCAGTTCCCAAAATTAGCAATAGATATGCTAA
8/11 dsp_distal
chr13:38109216-38109366, . dsp_distal
ACCCATTGTTCCCAGCGGGAAGGTGGGATGTGGCCCTCAGCATACAGCAGCAGGTGCAGGAGCTGGGAGCCAGCGTCAGGCACACCCCAGTCTCTAGCTTTTGTTCTGTAGCCACCACCCTAGCTGAGGCTATTGAAATAGCCTTTTGTC
9/11 dsp_proximal
chr13:38123944-38124094, . dsp_proximal
TGCCACACCTTCTGAATGTCAGGTGAAGAATTTGCCTAGACCAAGCACCCTTGATCAAGTCAGAACAGGTTTCCTAAAGACCGGGCCCATTAACATTAATGGACTCAAAATAGGCCAATCTCTTCTTAAATGTGGCTTCTTTTTGTTCTG
10/11 cdh1_inragenic
chr8:106609196-106609346, . cdh1_inragenic
CATAAATATGGGAGGGGGGTGGAATGCATGAATGTAAAGAATGCTGGCATAAGTAGGACCAGTGAGTTGGGTGTGGCAGTGCACCTTGGAGACAGAGCCAGGCGGACCTCTGAGTTCAAGGACAGTCATCAGGGCTCCACAGAGAAATCC