Plot tracks abve the signal

In [ ]:
# TODO - list files to use
In [1]:
import pandas as pd
import numpy as np
from pybedtools import BedTool
from basepair.config import get_data_dir, create_tf_session
from tqdm import tqdm
from concise.preprocessing import encodeDNA
from basepair.datasets import get_sox2_data
from basepair.plots import plot_loss
import pyBigWig
from basepair.math import softmax
import matplotlib.pyplot as plt

from basepair.datasets import *
from basepair.datasets import sox2_oct4_peaks_sox2
from basepair import samplers
from concise.utils.plot import seqlogo_fig, seqlogo
from imp import reload
import h5py
from modisco.visualization import viz_sequence
from matplotlib import pyplot as plt

ddir = get_data_dir()
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
In [2]:
train, valid, test = sox2_oct4_peaks_sox2()
In [3]:
modisco_file = f"{ddir}/processed/chipnexus/motifs/sox2-oct4/modisco/counts.hdf5"
In [4]:
!ls {ddir}/processed/chipnexus/motifs/*/modisco/*
/users/avsec/workspace/basepair/basepair/../data/processed/chipnexus/motifs/oct4/modisco/counts.hdf5
/users/avsec/workspace/basepair/basepair/../data/processed/chipnexus/motifs/oct4/modisco/defaults.hdf5
/users/avsec/workspace/basepair/basepair/../data/processed/chipnexus/motifs/oct4/modisco/multi-task.hdf5
/users/avsec/workspace/basepair/basepair/../data/processed/chipnexus/motifs/oct4/modisco/valid_include.npy
/users/avsec/workspace/basepair/basepair/../data/processed/chipnexus/motifs/sox2/modisco/counts.hdf5
/users/avsec/workspace/basepair/basepair/../data/processed/chipnexus/motifs/sox2/modisco/defaults.hdf5
/users/avsec/workspace/basepair/basepair/../data/processed/chipnexus/motifs/sox2/modisco/multi-task.hdf5
/users/avsec/workspace/basepair/basepair/../data/processed/chipnexus/motifs/sox2/modisco/valid_include.npy
/users/avsec/workspace/basepair/basepair/../data/processed/chipnexus/motifs/sox2-oct4/modisco/counts.hdf5
In [5]:
modisco_file = f"{ddir}/processed/chipnexus/motifs/oct4/modisco/counts.hdf5"
In [6]:
from kipoi.readers import HDF5Reader
In [7]:
f = HDF5Reader(modisco_file)
In [8]:
f.open()
In [9]:
f.f['/metacluster_idx_to_submetacluster_results/metacluster0/seqlets'][:][0]
Out[9]:
b'example:0,start:53,end:94,rc:False'
In [10]:
f.ls()
Out[10]:
[('/metacluster_idx_to_submetacluster_results/metacluster0/activity_pattern',
  <HDF5 dataset "activity_pattern": shape (1,), type "<i8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets',
  <HDF5 dataset "seqlets": shape (5433,), type "|O">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/affmat',
  <HDF5 dataset "affmat": shape (2171, 2171), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/cluster_results/cluster_indices',
  <HDF5 dataset "cluster_indices": shape (2171,), type "<i8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/all_pattern_names',
  <HDF5 dataset "all_pattern_names": shape (4,), type "|O">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_0/seqlets_and_alnmts/alnmts',
  <HDF5 dataset "alnmts": shape (889,), type "<i8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_0/seqlets_and_alnmts/seqlets',
  <HDF5 dataset "seqlets": shape (889,), type "|O">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_0/sequence/fwd',
  <HDF5 dataset "fwd": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_0/sequence/rev',
  <HDF5 dataset "rev": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_0/task0_contrib_scores/fwd',
  <HDF5 dataset "fwd": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_0/task0_contrib_scores/rev',
  <HDF5 dataset "rev": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_0/task0_hypothetical_contribs/fwd',
  <HDF5 dataset "fwd": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_0/task0_hypothetical_contribs/rev',
  <HDF5 dataset "rev": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_1/seqlets_and_alnmts/alnmts',
  <HDF5 dataset "alnmts": shape (179,), type "<i8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_1/seqlets_and_alnmts/seqlets',
  <HDF5 dataset "seqlets": shape (179,), type "|O">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_1/sequence/fwd',
  <HDF5 dataset "fwd": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_1/sequence/rev',
  <HDF5 dataset "rev": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_1/task0_contrib_scores/fwd',
  <HDF5 dataset "fwd": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_1/task0_contrib_scores/rev',
  <HDF5 dataset "rev": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_1/task0_hypothetical_contribs/fwd',
  <HDF5 dataset "fwd": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_1/task0_hypothetical_contribs/rev',
  <HDF5 dataset "rev": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_2/seqlets_and_alnmts/alnmts',
  <HDF5 dataset "alnmts": shape (160,), type "<i8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_2/seqlets_and_alnmts/seqlets',
  <HDF5 dataset "seqlets": shape (160,), type "|O">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_2/sequence/fwd',
  <HDF5 dataset "fwd": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_2/sequence/rev',
  <HDF5 dataset "rev": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_2/task0_contrib_scores/fwd',
  <HDF5 dataset "fwd": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_2/task0_contrib_scores/rev',
  <HDF5 dataset "rev": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_2/task0_hypothetical_contribs/fwd',
  <HDF5 dataset "fwd": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_2/task0_hypothetical_contribs/rev',
  <HDF5 dataset "rev": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_3/seqlets_and_alnmts/alnmts',
  <HDF5 dataset "alnmts": shape (40,), type "<i8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_3/seqlets_and_alnmts/seqlets',
  <HDF5 dataset "seqlets": shape (40,), type "|O">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_3/sequence/fwd',
  <HDF5 dataset "fwd": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_3/sequence/rev',
  <HDF5 dataset "rev": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_3/task0_contrib_scores/fwd',
  <HDF5 dataset "fwd": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_3/task0_contrib_scores/rev',
  <HDF5 dataset "rev": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_3/task0_hypothetical_contribs/fwd',
  <HDF5 dataset "fwd": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_3/task0_hypothetical_contribs/rev',
  <HDF5 dataset "rev": shape (70, 4), type "<f8">),
 ('/metaclustering_results/all_metacluster_names',
  <HDF5 dataset "all_metacluster_names": shape (1,), type "|O">),
 ('/metaclustering_results/metacluster_indices',
  <HDF5 dataset "metacluster_indices": shape (5453,), type "<i8">),
 ('/multitask_seqlet_creation_results/final_seqlets',
  <HDF5 dataset "final_seqlets": shape (5453,), type "|O">),
 ('/multitask_seqlet_creation_results/task_name_to_coord_producer_results/task0/coords',
  <HDF5 dataset "coords": shape (5453,), type "|O">),
 ('/multitask_seqlet_creation_results/task_name_to_coord_producer_results/task0/thresholding_results/densities',
  <HDF5 dataset "densities": shape (100,), type "<f8">),
 ('/multitask_seqlet_creation_results/task_name_to_coord_producer_results/task0/vals_to_threshold',
  <HDF5 dataset "vals_to_threshold": shape (11050,), type "<f8">),
 ('/task_names', <HDF5 dataset "task_names": shape (1,), type "|O">)]
In [11]:
import yaml
In [12]:
f.close()
In [36]:
from basepair.modisco import nan_like, parse_seqlet, ModiscoResult
In [14]:
from concise.utils.pwm import PWM
In [37]:
mr = ModiscoResult(modisco_file)
In [16]:
r = mr.extract_signal(valid[0])
In [17]:
def ic_scale(x):
    background = np.array([0.27, 0.23, 0.23, 0.27])
    return viz_sequence.ic_scale(x, background=background)
In [18]:
seqlogo_fig(ic_scale(r['pattern_1'].mean(axis=0)), figsize=(10,2));
In [19]:
# TODO - extract the signal
In [20]:
signal = mr.extract_signal(valid[1]['sox2'])['pattern_0'].mean(axis=0)
sequence = ic_scale(mr.extract_signal(valid[0])['pattern_0'].mean(axis=0))
In [21]:
# TODO - assemble figures into one?
In [22]:
def plot_profiles(x, y, mresult):
    for pattern in mr.patterns():
        signal = mr.extract_signal(y)[pattern].mean(axis=0)
        sequence = ic_scale(mr.extract_signal(x)[pattern].mean(axis=0))

        fig, (ax0, ax1) = plt.subplots(2, 1, sharex=True, figsize=(20,2))
        ax0.set_title(f"{pattern}")
        ax0.plot(signal[:,0], label='fwd')
        ax0.plot(signal[:,1], label='rev')
        ax0.legend()
        seqlogo(sequence, ax=ax1)
In [23]:
mr.close()
In [ ]:
from basepair.modisco import ModiscoResult

Sox2 - signal

In [2]:
fpath = f"{ddir}/processed/chipnexus/motifs/sox2/modisco/multi-task.0.valid.h5"
mr = ModiscoResult(fpath)
incl = np.load(fpath+".valid.npy")
mr.plot_profiles(valid[0][incl], {"Sox2": valid[1]['sox2'][incl],
                                  "Oct4": valid[1]['oct4'][incl]},
                rc_vec=[False, True],
                start_vec = [15, 15],
                width=40,
                legend=False,
                ylim=[0, 7.45],
                seq_height=1.5,
                fpath_template='fig/icml18/motifs/sox2-signal.motif_{}',
                figsize=(6,2.5))
mr.close()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-8f54f4b64770> in <module>()
----> 1 fpath = f"{ddir}/processed/chipnexus/motifs/sox2/modisco/multi-task.0.valid.h5"
      2 mr = ModiscoResult(fpath)
      3 incl = np.load(fpath+".valid.npy")
      4 mr.plot_profiles(valid[0][incl], {"Sox2": valid[1]['sox2'][incl],
      5                                   "Oct4": valid[1]['oct4'][incl]},

NameError: name 'ddir' is not defined

Oct4 - signal

In [228]:
fpath = f"{ddir}/processed/chipnexus/motifs/oct4/modisco/multi-task.1.valid.h5"
mr = ModiscoResult(fpath)

df_oct4 = pd.DataFrame.from_dict(mr.seqlets()['pattern_0'])
df_oct4['id'] = np.arange(len(df_oct4))
df_oct4['chr'] = df_oct4.example.astype(str)
In [229]:
fpath = f"{ddir}/processed/chipnexus/motifs/sox2/modisco/multi-task.0.valid.h5"
mr = ModiscoResult(fpath)

df_sox2 = pd.DataFrame.from_dict(mr.seqlets()['pattern_1'])
df_sox2['id'] = np.arange(len(df_sox2))
df_sox2['chr'] = df_sox2.example.astype(str)
In [230]:
from pybedtools import BedTool
In [234]:
int_oct4 = BedTool.from_dataframe(df_oct4[['chr', 'start', 'end']])
In [236]:
int_sox2 = BedTool.from_dataframe(df_sox2[['chr', 'start', 'end']])
In [238]:
intersect =int_oct4.intersect(int_sox2, wa=True,  u=True)
In [240]:
intersect_b =int_sox2.intersect(int_oct4, wa=True,  u=True)
In [239]:
len(intersect)
Out[239]:
89
In [242]:
len(int_sox2)
Out[242]:
295
In [243]:
len(int_oct4)
Out[243]:
340
In [241]:
len(intersect_b)
Out[241]:
80
In [244]:
# Hence ~80/300  intervals overlap
In [246]:
80/300
Out[246]:
0.26666666666666666
In [218]:
m = pd.merge(df_sox2, df_oct4, on='example', how='outer')
In [227]:
df_sox2['chr'] = df_sox2.example.astype(str)
Out[227]:
end example rc start id
0 186 382 False 116 0
1 140 548 False 70 1
2 108 523 False 38 2
3 189 80 False 119 3
4 119 479 False 49 4
5 154 610 False 84 5
6 71 73 False 1 6
7 105 157 False 35 7
8 114 57 False 44 8
9 130 163 False 60 9
10 130 683 False 60 10
11 84 568 False 14 11
12 109 706 False 39 12
13 161 262 False 91 13
14 124 504 False 54 14
15 72 774 False 2 15
16 120 110 False 50 16
17 155 328 False 85 17
18 155 607 False 85 18
19 110 204 False 40 19
20 164 724 False 94 20
21 148 48 True 78 21
22 76 345 False 6 22
23 76 564 False 6 23
24 109 602 False 39 24
25 186 671 False 116 25
26 195 498 False 125 26
27 156 507 False 86 27
28 88 384 False 18 28
29 104 240 False 34 29
... ... ... ... ... ...
265 170 643 True 100 265
266 193 509 True 123 266
267 102 13 True 32 267
268 165 9 True 95 268
269 174 576 True 104 269
270 147 653 True 77 270
271 177 625 False 107 271
272 94 736 True 24 272
273 115 179 False 45 273
274 102 770 True 32 274
275 128 175 False 58 275
276 155 227 False 85 276
277 107 244 False 37 277
278 130 398 False 60 278
279 158 591 False 88 279
280 76 25 False 6 280
281 124 593 False 54 281
282 140 304 False 70 282
283 146 614 False 76 283
284 88 256 False 18 284
285 166 456 False 96 285
286 188 549 False 118 286
287 173 646 False 103 287
288 134 285 False 64 288
289 157 385 False 87 289
290 171 202 False 101 290
291 171 412 False 101 291
292 190 580 False 120 292
293 185 609 False 115 293
294 118 585 False 48 294

295 rows × 5 columns

In [224]:
new_df = m.loc[(m["start_x"] >= m["start_y"]) & (m["end_x"] <= m["end_y"])]
In [226]:
new_df
Out[226]:
end_x example rc_x start_x id_x end_y rc_y start_y id_y
238 170.0 61 True 100.0 224.0 170.0 False 100.0 133.0
In [222]:
(m.start_y>m.start_x)*()
Out[222]:
end_x example rc_x start_x id_x end_y rc_y start_y id_y
0 186.0 382 False 116.0 0.0 139.0 True 69.0 73.0
1 140.0 548 False 70.0 1.0 NaN NaN NaN NaN
2 108.0 523 False 38.0 2.0 NaN NaN NaN NaN
3 189.0 80 False 119.0 3.0 179.0 False 109.0 44.0
4 119.0 479 False 49.0 4.0 NaN NaN NaN NaN
5 154.0 610 False 84.0 5.0 NaN NaN NaN NaN
6 71.0 73 False 1.0 6.0 175.0 False 105.0 302.0
7 105.0 157 False 35.0 7.0 190.0 False 120.0 252.0
8 114.0 57 False 44.0 8.0 107.0 False 37.0 169.0
9 130.0 163 False 60.0 9.0 179.0 False 109.0 152.0
10 130.0 683 False 60.0 10.0 NaN NaN NaN NaN
11 84.0 568 False 14.0 11.0 NaN NaN NaN NaN
12 109.0 706 False 39.0 12.0 NaN NaN NaN NaN
13 161.0 262 False 91.0 13.0 NaN NaN NaN NaN
14 124.0 504 False 54.0 14.0 NaN NaN NaN NaN
15 72.0 774 False 2.0 15.0 NaN NaN NaN NaN
16 120.0 110 False 50.0 16.0 NaN NaN NaN NaN
17 155.0 328 False 85.0 17.0 138.0 True 68.0 83.0
18 155.0 328 False 85.0 17.0 97.0 False 27.0 161.0
19 155.0 607 False 85.0 18.0 NaN NaN NaN NaN
20 110.0 204 False 40.0 19.0 NaN NaN NaN NaN
21 164.0 724 False 94.0 20.0 NaN NaN NaN NaN
22 148.0 48 True 78.0 21.0 87.0 False 17.0 118.0
23 76.0 345 False 6.0 22.0 125.0 False 55.0 207.0
24 76.0 564 False 6.0 23.0 NaN NaN NaN NaN
25 109.0 602 False 39.0 24.0 NaN NaN NaN NaN
26 186.0 671 False 116.0 25.0 NaN NaN NaN NaN
27 195.0 498 False 125.0 26.0 NaN NaN NaN NaN
28 156.0 507 False 86.0 27.0 NaN NaN NaN NaN
29 88.0 384 False 18.0 28.0 90.0 False 20.0 149.0
... ... ... ... ... ... ... ... ... ...
513 NaN 45 NaN NaN NaN 151.0 False 81.0 281.0
514 NaN 347 NaN NaN NaN 91.0 True 21.0 282.0
515 NaN 66 NaN NaN NaN 101.0 False 31.0 283.0
516 NaN 372 NaN NaN NaN 112.0 True 42.0 284.0
517 NaN 53 NaN NaN NaN 171.0 True 101.0 285.0
518 NaN 87 NaN NaN NaN 145.0 False 75.0 286.0
519 NaN 201 NaN NaN NaN 80.0 True 10.0 287.0
520 NaN 371 NaN NaN NaN 92.0 True 22.0 290.0
521 NaN 270 NaN NaN NaN 167.0 False 97.0 293.0
522 NaN 3 NaN NaN NaN 86.0 False 16.0 294.0
523 NaN 301 NaN NaN NaN 122.0 False 52.0 296.0
524 NaN 230 NaN NaN NaN 161.0 False 91.0 300.0
525 NaN 62 NaN NaN NaN 81.0 True 11.0 303.0
526 NaN 109 NaN NaN NaN 138.0 True 68.0 305.0
527 NaN 257 NaN NaN NaN 143.0 False 73.0 308.0
528 NaN 215 NaN NaN NaN 161.0 False 91.0 313.0
529 NaN 278 NaN NaN NaN 91.0 True 21.0 315.0
530 NaN 359 NaN NaN NaN 144.0 False 74.0 316.0
531 NaN 395 NaN NaN NaN 149.0 False 79.0 318.0
532 NaN 194 NaN NaN NaN 130.0 True 60.0 319.0
533 NaN 223 NaN NaN NaN 145.0 False 75.0 321.0
534 NaN 184 NaN NaN NaN 133.0 False 63.0 324.0
535 NaN 389 NaN NaN NaN 164.0 False 94.0 326.0
536 NaN 263 NaN NaN NaN 100.0 False 30.0 327.0
537 NaN 150 NaN NaN NaN 132.0 False 62.0 330.0
538 NaN 228 NaN NaN NaN 184.0 True 114.0 332.0
539 NaN 94 NaN NaN NaN 86.0 True 16.0 334.0
540 NaN 129 NaN NaN NaN 86.0 False 16.0 336.0
541 NaN 130 NaN NaN NaN 85.0 False 15.0 337.0
542 NaN 34 NaN NaN NaN 115.0 False 45.0 338.0

543 rows × 9 columns

In [207]:
df.head()
Out[207]:
end example rc start
0 113 158 False 43
1 143 46 False 73
2 134 111 False 64
3 114 358 False 44
4 162 361 False 92

Sox2 - signal

In [310]:
fpath = f"{ddir}/processed/chipnexus/motifs/sox2/modisco/multi-task.0.valid.h5"
mr = ModiscoResult(fpath)
incl = np.load(fpath+".valid.npy")
mr.plot_profiles(valid[0][incl], {"Sox2": valid[1]['sox2'][incl],
                                  "Oct4": valid[1]['oct4'][incl]},
                rc_vec=[False, True],
                start_vec = [15, 15],
                width=40,
                legend=False,
                ylim=[[0, 3.5],[0, 7.45]],
                 fpath_template='fig/icml18/motifs/sox2-signal.motif_{}',
                 n_bootstrap=100,
                seq_height=1.5,
                figsize=(6,2.5))
mr.close()

Oct4 - signal

In [311]:
fpath = f"{ddir}/processed/chipnexus/motifs/oct4/modisco/multi-task.1.valid.h5"
mr = ModiscoResult(fpath)

incl = np.load(fpath+".valid.npy")
mr.plot_profiles(valid[0][incl], {"Sox2": valid[1]['sox2'][incl],
                                  "Oct4": valid[1]['oct4'][incl]},
                rc_vec=[True, True],
                start_vec = [18, 10],
                width=40,
                legend=False,
                seq_height=1.5,
                ylim=[[0, 3.5],[0, 7.45]],
                fpath_template='fig/icml18/motifs/oct4-signal.motif_{}',
                n_limit = 35,
                n_bootstrap=100,
                figsize=(6,2.5))
mr.close()

Sox2 - counts

In [312]:
fpath = f"{ddir}/processed/chipnexus/motifs/sox2/modisco/multi-task.2.valid.h5"
mr = ModiscoResult(fpath)

incl = np.load(fpath+".valid.npy")
mr.plot_profiles(valid[0][incl], {"Sox2": valid[1]['sox2'][incl],
                                  "Oct4": valid[1]['oct4'][incl]},
                rc_vec=[True, True, True, False],
                start_vec = [23, 8, 8, 18],
                width=40,
                legend=False,
                ylim=[[0, 3.5],[0, 7.45]],
                fpath_template='fig/icml18/motifs/sox2-counts.motif_{}',
                seq_height=1.5,
                n_bootstrap=100,
                figsize=(6,2.5))
mr.close()
In [302]:
a=1

Oct4 - counts

In [313]:
fpath = f"{ddir}/processed/chipnexus/motifs/oct4/modisco/multi-task.3.valid.h5"
mr = ModiscoResult(fpath)
incl = np.load(fpath+".valid.npy")
mr.plot_profiles(valid[0][incl], {"Sox2": valid[1]['sox2'][incl],
                                  "Oct4": valid[1]['oct4'][incl]},
                 rc_vec=[False, True, False, False],
                 start_vec=[10, 18,30,20],
                 width=40,
                 legend=False,
                 ylim=[[0, 3.5],[0, 7.45]],
                 seq_height=1.5,
                 fpath_template='fig/icml18/motifs/oct4-counts.motif_{}',
                 n_bootstrap=100,
                 figsize=(6,2.5))
mr.close()

Test

In [134]:
fpath = f"{ddir}/processed/chipnexus/motifs/oct4/modisco/multi-task.0.test.h5"
mr = ModiscoResult(fpath)
incl = np.load(fpath+".test.npy")
mr.plot_profiles(test[0][incl], {"Sox2": test[1]['sox2'][incl],
                                  "Oct4": test[1]['oct4'][incl]},
                 rc_vec=[True, True, False, False],
                 #start_vec=[10, 18,20,15],
                 #width=50,
                figsize=(20,3))
mr.close()
In [138]:
fpath = f"{ddir}/processed/chipnexus/motifs/oct4/modisco/multi-task.1.test.h5"
mr = ModiscoResult(fpath)
incl = np.load(fpath+".test.npy")
mr.plot_profiles(test[0][incl], {"Sox2": test[1]['sox2'][incl],
                                  "Oct4": test[1]['oct4'][incl]},
                 rc_vec=[False, True, False, False],
                 #start_vec=[10, 18,20,15],
                 #width=50,
                figsize=(20,3))
mr.close()
In [137]:
fpath = f"{ddir}/processed/chipnexus/motifs/oct4/modisco/multi-task.1.valid.h5"
mr = ModiscoResult(fpath)
incl = np.load(fpath+".valid.npy")
mr.plot_profiles(valid[0][incl], {"Sox2": valid[1]['sox2'][incl],
                                  "Oct4": valid[1]['oct4'][incl]},
                 rc_vec=[True, True, False, False],
                 #start_vec=[10, 18,20,15],
                 #width=50,
                figsize=(20,3))
mr.close()

Sox2 - counts

In [94]:
modisco_file = f"{ddir}/processed/chipnexus/motifs/oct4/modisco/multi-task.3.valid.h5"
In [95]:
hdf5_results = h5py.File(modisco_file)

patterns = (list(hdf5_results
                 ["metacluster_idx_to_submetacluster_results"]
                 ["metacluster0"]
                 ["seqlets_to_patterns_result"]
                 ["patterns"]["all_pattern_names"]))
print(len(patterns))
pattern_grp = (hdf5_results
                 ["metacluster_idx_to_submetacluster_results"]
                 ["metacluster0"]
                 ["seqlets_to_patterns_result"]
                 ["patterns"])
4
In [100]:
for pattern_name in patterns:
    pattern = pattern_grp[pattern_name]
    print(pattern_name)
    print("total seqlets:",len(pattern["seqlets_and_alnmts"]["seqlets"]))
    background = np.array([0.27, 0.23, 0.23, 0.27])
    print("fwd:")
    viz_sequence.plot_weights(pattern["task0_contrib_scores"]["fwd"])
    viz_sequence.plot_weights(pattern["task0_hypothetical_contribs"]["fwd"])
    viz_sequence.plot_weights(viz_sequence.ic_scale(np.array(pattern["sequence"]["fwd"]),
                                                    background=background))

    print("reverse:")
    viz_sequence.plot_weights(pattern["task0_contrib_scores"]["rev"])
    viz_sequence.plot_weights(pattern["task0_hypothetical_contribs"]["rev"])
    viz_sequence.plot_weights(viz_sequence.ic_scale(np.array(pattern["sequence"]["rev"]),
                                                    background=background))
b'pattern_0'
total seqlets: 889
fwd:
reverse:
b'pattern_1'
total seqlets: 179
fwd:
reverse:
b'pattern_2'
total seqlets: 160
fwd:
reverse:
b'pattern_3'
total seqlets: 40
fwd:
reverse: