In [156]:
from basepair.imports import *
from basepair.plot.profiles import extract_signal
from basepair.math import softmax
from basepair.plot.heatmaps import heatmap_stranded_profile, multiple_heatmap_stranded_profile
from basepair.plot.profiles import  plot_stranded_profile, multiple_plot_stranded_profile
from basepair.modisco.results import Seqlet, resize_seqlets
from basepair.modisco.pattern_instances import dfi2seqlets, annotate_profile
from basepair.cli.modisco import load_profiles
from basepair.BPNet import BPNetPredictor
from basepair.plot.tracks import plot_tracks, filter_tracks
from basepair.preproc import rc_seq
from copy import deepcopy
from basepair.exp.chipnexus.simulate import (insert_motif, generate_sim, plot_sim, generate_seq, 
                                             model2tasks, motif_coords, interactive_tracks, plot_motif_table,
                                             plot_sim_motif_col)

from scipy.fftpack import fft, ifft
import warnings
warnings.filterwarnings("ignore")
hv.extension('bokeh', 'matplotlib')

# interval columns in dfi
interval_cols = ['example_chrom', 'pattern_start_abs', 'pattern_end_abs']

TODO

  • [x] how were the quantiles computed? on restricted sets
  • [x] annotate the in-vivo profiles using the same metrics as in vivo
    • [X] use the average profile from modisco as the reference
    • [X] sanity check - plot the motif
  • [x] add also simulated data to the plot
  • [ ] plot the frequency domain (spacing)
  • [ ] mask TE's from the analysis

Load the required data

In [3]:
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / "modisco/all/profile/"

mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()

# Load the data
d = HDF5Reader(model_dir / "grad.all.h5")
d.open()
In [4]:
# specify motifs to use in the analysis
motifs = OrderedDict([
    ("Oct4-Sox2", "m0_p0"),
    #("Oct4-Sox2-deg", "m6_p8"),
    #("Oct4", "m0_p18"),
    ("Sox2", "m0_p1"),
    # ("Essrb", "m0_p2"),
    ("Nanog", "m0_p3"),
    #("Nanog-periodic", "m0_p9"),
    ("Klf4", "m2_p0"),
])

side_motifs = OrderedDict([
    ("Oct4-Sox2", ("m0_p0", "TTTGCATAACAA")),
    ("Sox2", ("m0_p1", "CCATTGTT")),
    # ("Essrb", ("m0_p2", "TCAAGGTCA")),
    ("Nanog", ("m0_p3", "TTGATGGC")),
    ("Klf4", ("m2_p0", "GGGTGTGG")),
])
In [5]:
# plot them
for tf, p in motifs.items():
    mr.get_pattern(longer_pattern(p)).trim_seq_ic(0.08).plot("seq");
    plt.title(tf)
TF-MoDISco is using the TensorFlow backend.
In [6]:
from basepair.modisco.pattern_instances import load_instances, filter_nonoverlapping_intervals, plot_coocurence_matrix
In [64]:
dfi_full = pd.read_parquet(f"{modisco_dir}/instances.parq", engine='fastparquet')
In [162]:
# TODO get transposable element locations
dfc_te = pd.read_csv("http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/motif_clustering/te_contrib.csv")
te_patterns = dfc_te[dfc_te['ic pwm mean'] > 0.8].pattern.map(longer_pattern).unique()
In [178]:
dfc_te.sort_values("ic pwm mean")
Out[178]:
index pattern idx n seqlets ic pwm mean Klf4 imp profile Klf4 imp counts Klf4 footprint entropydiff Klf4 footprint max Klf4 footprint strandcor ... logo_imp logo_seq Klf4/d Klf4/d_p Nanog/d Nanog/d_p Oct4/d Oct4/d_p Sox2/d Sox2/d_p
41 41 m9_p8 130 73 0.472524 0.001462 0.002748 0.242646 1.130137 0.005374 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.001462 26.086957 0.002003 23.913043 0.001655 43.478261 0.001289 41.304348
23 23 m2_p13 63 271 0.478165 0.002336 0.002966 0.108165 0.814672 0.004987 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.002336 34.782609 0.001997 21.739130 0.000730 15.217391 0.000595 13.043478
29 29 m2_p22 72 119 0.536395 0.002884 0.003745 0.298471 1.013514 0.005600 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.002884 45.652174 0.002456 32.608696 0.001495 36.956522 0.001315 43.478261
40 40 m9_p7 129 113 0.542158 0.002740 0.003618 0.233367 0.462617 0.004966 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.002740 41.304348 0.003822 39.130435 0.006226 76.086957 0.003990 52.173913
22 22 m2_p7 57 407 0.554443 0.004331 0.004289 0.116025 0.822281 0.005262 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.004331 60.869565 0.003140 34.782609 0.001447 34.782609 0.001025 34.782609
45 45 m12_p5 140 81 0.615796 -0.000106 0.001161 0.779698 0.617284 0.007167 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... -0.000106 10.869565 0.000653 8.695652 0.001368 32.608696 0.000614 17.391304
7 7 m0_p17 17 192 0.619684 0.008216 0.007460 0.331912 4.447644 0.005828 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.008216 95.652174 0.020173 93.478261 0.003874 60.869565 0.010562 93.478261
21 21 m2_p6 56 446 0.647461 0.002555 0.003414 0.139737 0.557692 0.005082 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.002555 36.956522 0.002417 30.434783 0.001055 28.260870 0.000931 30.434783
30 30 m3_p7 81 192 0.666478 0.002679 0.003353 0.281033 1.341146 0.005346 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.002679 39.130435 0.009286 54.347826 0.003372 54.347826 0.003743 50.000000
12 12 m0_p27 27 93 0.668124 0.004715 0.006502 0.313346 1.811828 0.005514 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.004715 69.565217 0.037732 100.000000 0.006294 78.260870 0.011500 95.652174
36 36 m7_p4 115 225 0.695505 0.004964 0.004443 0.375585 1.050725 0.005898 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.004964 73.913043 0.003850 41.304348 0.001011 23.913043 0.000977 32.608696
18 18 m1_p10 44 163 0.707070 0.000314 0.001748 0.152147 0.444444 0.004984 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.000314 13.043478 0.005660 47.826087 0.001020 26.086957 0.001059 36.956522
20 20 m2_p4 54 525 0.707841 0.001524 0.002870 0.223327 0.636821 0.005485 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.001524 28.260870 0.001585 19.565217 0.000747 17.391304 0.000560 10.869565
2 2 m0_p10 10 288 0.713371 0.004522 0.005118 0.223733 0.898955 0.005000 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.004522 67.391304 0.013976 65.217391 0.007566 82.608696 0.007925 76.086957
1 1 m0_p7 7 449 0.719410 0.006113 0.006041 0.135743 0.844519 0.005226 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.006113 80.434783 0.016590 82.608696 0.011334 100.000000 0.007809 73.913043
5 5 m0_p13 13 222 0.778216 0.008905 0.007992 0.306191 1.851351 0.005864 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.008905 97.826087 0.020571 95.652174 0.009740 93.478261 0.011707 100.000000
15 15 m0_p31 31 87 0.845868 0.007243 0.006276 0.371036 2.563218 0.005592 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.007243 89.130435 0.010872 58.695652 0.008968 89.130435 0.008442 80.434783
4 4 m0_p12 12 235 0.851287 0.004242 0.005045 0.192130 1.004255 0.005253 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.004242 58.695652 0.017770 84.782609 0.009833 95.652174 0.007349 67.391304
32 32 m3_p11 85 104 0.852564 0.003328 0.004784 0.318391 1.153846 0.005233 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.003328 54.347826 0.015944 80.434783 0.008928 86.956522 0.007249 63.043478
37 37 m7_p7 118 114 0.919510 0.007526 0.006481 0.387315 3.301802 0.006107 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.007526 91.304348 0.014242 67.391304 0.002700 52.173913 0.007489 69.565217
9 9 m0_p20 20 132 0.935129 0.004358 0.005421 0.295098 0.553435 0.005598 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.004358 63.043478 0.015186 73.913043 0.010313 97.826087 0.009646 84.782609
14 14 m0_p30 30 82 0.940724 0.002193 0.003036 0.359042 0.969512 0.005408 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.002193 32.608696 0.009224 52.173913 0.003859 58.695652 0.004155 54.347826
19 19 m2_p2 52 1537 0.960807 0.005555 0.005067 0.491592 1.240741 0.006543 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.005555 78.260870 0.002035 26.086957 0.000791 19.565217 0.000604 15.217391
26 26 m2_p19 69 102 1.009759 0.001179 0.002279 0.580441 0.740196 0.005669 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.001179 23.913043 0.001101 15.217391 0.000399 8.695652 0.000651 19.565217
0 0 m0_p6 6 492 1.009868 0.003056 0.004000 0.317918 0.933809 0.004924 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.003056 47.826087 0.011296 60.869565 0.006947 80.434783 0.006632 60.869565
38 38 m7_p8 119 95 1.034028 0.007549 0.005557 0.518601 3.147368 0.006006 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.007549 93.478261 0.008509 50.000000 0.002393 47.826087 0.002650 47.826087
35 35 m6_p7 108 73 1.041957 0.004813 0.005027 0.477036 1.328767 0.006002 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.004813 71.739130 0.011450 63.043478 0.003878 63.043478 0.006386 58.695652
16 16 m0_p32 32 107 1.109266 0.003139 0.004579 0.387369 1.060748 0.005652 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.003139 50.000000 0.019891 91.304348 0.004457 69.565217 0.010126 89.130435
42 42 m9_p9 131 71 1.124416 0.000353 0.001456 0.932460 0.565217 0.005840 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.000353 15.217391 0.000350 4.347826 0.001828 45.652174 0.000784 26.086957
34 34 m6_p4 105 214 1.129380 0.006838 0.006306 0.327225 2.438679 0.005793 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.006838 84.782609 0.014538 69.565217 0.002482 50.000000 0.007668 71.739130
25 25 m2_p18 68 124 1.135644 0.002843 0.003969 0.457620 1.391129 0.005199 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.002843 43.478261 0.002040 28.260870 0.000979 21.739130 0.000729 21.739130
6 6 m0_p14 14 211 1.143701 0.007159 0.006874 0.392146 4.528436 0.006088 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.007159 86.956522 0.021837 97.826087 0.003574 56.521739 0.010548 91.304348
3 3 m0_p11 11 228 1.148469 0.003160 0.004552 0.289907 1.212719 0.005060 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.003160 52.173913 0.015581 76.086957 0.009253 91.304348 0.007341 65.217391
17 17 m0_p33 33 78 1.199418 0.000682 0.002454 0.399765 0.750000 0.005493 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.000682 21.739130 0.004070 43.478261 0.001528 39.130435 0.002011 45.652174
39 39 m8_p0 121 93 1.206696 -0.000718 0.001634 0.583992 0.270115 0.004895 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... -0.000718 4.347826 0.000680 10.869565 -0.000493 2.173913 -0.000653 4.347826
13 13 m0_p29 29 172 1.223353 0.004472 0.005155 0.360993 1.209302 0.006087 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.004472 65.217391 0.018343 86.956522 0.004900 71.739130 0.009933 86.956522
8 8 m0_p19 19 185 1.256564 0.006450 0.005948 0.504628 2.095109 0.006755 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.006450 82.608696 0.015942 78.260870 0.005643 73.913043 0.008498 82.608696
11 11 m0_p25 25 145 1.270167 0.010784 0.008251 0.426524 5.565517 0.006242 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.010784 100.000000 0.019318 89.130435 0.004203 65.217391 0.011625 97.826087
31 31 m3_p9 83 77 1.272437 -0.000393 0.001500 0.573661 0.967532 0.005506 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... -0.000393 8.695652 0.004576 45.652174 0.001116 30.434783 0.000780 23.913043
10 10 m0_p21 21 125 1.274445 0.003700 0.004732 0.743217 1.496000 0.006412 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.003700 56.521739 0.014732 71.739130 0.008676 84.782609 0.008006 78.260870
43 43 m10_p0 133 151 1.291720 -0.001750 0.002236 1.092027 1.453642 0.010248 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... -0.001750 2.173913 -0.000197 2.173913 -0.000406 4.347826 -0.000774 2.173913
33 33 m3_p12 86 100 1.344406 0.005021 0.005286 0.554784 2.325000 0.006907 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.005021 76.086957 0.010152 56.521739 0.004266 67.391304 0.006091 56.521739
28 28 m2_p21 71 77 1.346272 0.002175 0.003846 0.522576 0.947368 0.005166 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.002175 30.434783 0.001514 17.391304 0.000509 10.869565 0.000261 8.695652
24 24 m2_p17 67 132 1.453209 0.000426 0.003625 1.128753 1.555556 0.009901 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.000426 19.565217 0.000494 6.521739 0.000117 6.521739 -0.000255 6.521739
44 44 m12_p4 139 81 1.602915 -0.000394 0.001060 0.876106 0.722222 0.005174 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... -0.000394 6.521739 0.003165 36.956522 0.001604 41.304348 0.001257 39.130435
27 27 m2_p20 70 94 1.726234 0.000354 0.002089 0.556684 2.277174 0.005323 ... <img width=480 src="data:image/png;base64,iVBO... <img width=480 src="data:image/png;base64,iVBO... 0.000354 17.391304 0.001063 13.043478 0.000647 13.043478 0.000825 28.260870

46 rows × 62 columns

In [163]:
dfi_te = load_instances(dfi_full[dfi_full.pattern.isin(te_patterns)], None)
dfi_te = dfi_te[(dfi_te.match_weighted_p > 0.1) & (dfi_te.seq_match > 20)]
number of deduplicatd instances: 14767 (49.96785436334721%)
In [7]:
dfi = load_instances(dfi_full, motifs)

dfi = filter_nonoverlapping_intervals(dfi)

total_examples = len(dfi.example_idx.unique())
total_examples
number of deduplicatd instances: 1193962 (39.77998370101559%)
Out[7]:
35318
In [8]:
# annotate with profiles
profiles = load_profiles(modisco_dir, model_dir/'grad.all.h5')
dfi_anno = annotate_profile(dfi, mr, profiles, profile_width=70, trim_frac=0.08)
100%|██████████| 4/4 [18:49<00:00, 282.48s/it]
In [9]:
dfi = dfi_anno

Visualize the distribution of different profile properties based on the different categories

Categories:

  • imp_weighted_cat
  • match_weighted_cat
In [10]:
dfi_subset = dfi.query('pattern_name=="Oct4-Sox2"')
In [11]:
profile_features = [f for f in dfi_subset if f.startswith("Oct4/profile") and f.endswith("_p")]
categories = ['imp_weighted_cat', 'match_weighted_cat','seq_match_cat']

dfm = dfi_subset.melt(id_vars=categories + ['id'], value_vars=profile_features)
dfm = dfm.dropna()
dfm = dfm[~dfm.variable.str.contains("match")]

dfm.value = dfm.value.astype(float)
In [12]:
dfi_subset[dfi_subset.match_weighted_p > 0.1].plot.scatter("imp_weighted_p", 'Oct4/profile_counts_p', alpha=0.05)
plt.title("Counts vs importance for match_weighted_p > 0.1");
In [13]:
ggplot(aes(x='match_weighted_cat', y='value', color='imp_weighted_cat'), dfm) + geom_boxplot() + \
    facet_grid("variable~.", scales='free_y') + theme_classic() + ggtitle("High importance ~ higher profile counts")
Out[13]:
<ggplot: (-9223363307852546679)>

Plot the profiles - sanity check

In [14]:
dfi_subset = dfi[(dfi.pattern_center > 400) & (dfi.pattern_center < 600)].query('match_weighted_p > .5').\
                      query('imp_weighted_p > 0').query('pattern_name=="Oct4-Sox2"')
In [15]:
seqlets = dfi2seqlets(dfi_subset)
seqlets = resize_seqlets(seqlets, 70, seqlen=1000)
seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
In [16]:
multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=np.arange(1000), figsize=(10,10));
In [17]:
# pattern lengths
dfi.groupby(['pattern_name', 'pattern_len']).size()
Out[17]:
pattern_name  pattern_len
Klf4          10             215342
Nanog         10             260569
Oct4-Sox2     16             222388
Sox2          9              329282
dtype: int64

In vivo plots

In [18]:
from basepair.modisco.pattern_instances import construct_motif_pairs
In [19]:
comp_strand_compbination = {
    "++": "--",
    "--": "++",
    "-+": "-+",
    "+-": "+-"
}

strand_combinations = ["++", "--", "+-", "-+"]
In [20]:
pairs = []
for i in range(len(motifs)):
    for j in range(i, len(motifs)):
        pairs.append([ list(motifs)[i], list(motifs)[j], ])
In [24]:
profile_mapping = {
    "Oct4-Sox2": "Oct4",
    "Sox2": "Sox2",
    "Nanog": "Nanog",
    "Klf4": "Klf4",
    "Essrb": "Oct4"
}
In [25]:
from basepair.stats import smooth_window_agg, smooth_lowess, smooth_gam
In [26]:
# plt.style.use('default')
In [27]:
in_silico = read_pkl(f"{ddir}/processed/chipnexus/simulation/spacing.pkl")
In [28]:
sim_df_d, sim_res_dict_d = in_silico
In [29]:
sim_df_d.keys()
Out[29]:
dict_keys(['Oct4-Sox2', 'Sox2', 'Essrb', 'Nanog', 'Klf4', 'Oct4-Sox2/rc', 'Sox2/rc', 'Essrb/rc', 'Nanog/rc', 'Klf4/rc'])
In [30]:
def swap_orientation(o):
    if o=='+':
        return "-"
    if o=="-":
        return "+"
    raise ValueError("")

# Very much a hacked function to map simulation to in vivo data
def get_xy_sim_single(sim_df_d, motif_pair, feature, orientation):
    # For Nanog, always explicilty swap the orientation
    orientation_pair = [orientation[0], orientation[1]]
    # HACK! Nanog orientation didn't properly match the orientation
    if motif_pair[0] == "Nanog":
        orientation_pair[0] = swap_orientation(orientation_pair[0])
    if motif_pair[1] == "Nanog":
        orientation_pair[1] = swap_orientation(orientation_pair[1])

    mp = list(deepcopy(motif_pair))
    if orientation_pair[0] == "-":
        mp[0] = mp[0] + "/rc"
    if orientation_pair[1] == "-":
        mp[1] = mp[1] + "/rc"
        
    df = sim_df_d[mp[0]]  # choose the central motif
    df = df[df.motif == mp[1]]  # choose the side motif
    
    df = df[df.distance < 150]
    
    # select the task
    df = df[df.task == profile_mapping[motif_pair[0]]]        
    return df.distance.values, df[feature].values
    
def get_xy_sim(sim_df_d, motif_pair, feature, orientation):
    x1,y1 = get_xy_sim_single(sim_df_d, motif_pair, feature, orientation)
    x2,y2 = get_xy_sim_single(sim_df_d, list(reversed(motif_pair)), feature, comp_strand_compbination[orientation])
    assert np.all(x1 == x2)
    return x1, y1, y2
In [34]:
sim_df_d['Oct4-Sox2'].head()
Out[34]:
central_motif distance imp/count imp/count_frac imp/weighted imp/weighted_frac position profile/counts profile/counts_frac profile/max profile/max_frac profile/simmetric_kl side_motif task motif
0 TTTGCATAACAA 11 0.960853 0.640532 0.982284 0.851888 511 185.774918 2.305446 12.215563 2.295509 0.171915 TTTGCATAACAA Oct4 Oct4-Sox2
1 TTTGCATAACAA 11 0.573854 0.622881 0.606464 0.780607 511 56.876183 1.919967 5.422070 2.175977 0.328106 TTTGCATAACAA Sox2 Oct4-Sox2
2 TTTGCATAACAA 11 0.613244 0.608472 0.300140 0.690976 511 37.637058 1.391823 1.561548 1.947507 0.156946 TTTGCATAACAA Nanog Oct4-Sox2
3 TTTGCATAACAA 11 0.245048 0.578328 0.280038 0.607541 511 14.795116 1.022246 0.756418 1.435025 0.107881 TTTGCATAACAA Klf4 Oct4-Sox2
4 TTTGCATAACAA 12 0.586150 0.390744 0.495432 0.429665 512 111.218529 1.380209 4.595213 0.863518 0.134452 TTTGCATAACAA Oct4 Oct4-Sox2
In [170]:
# motif_pair = ['Nanog', 'Nanog']
d_dfi_filtered=[]
for motif_pair in tqdm(pairs):
    dfi_filtered = (dfi[(dfi.pattern_center > 400) & (dfi.pattern_center < 600)]
                    .query('match_weighted_p > 0.2')
                    .query('imp_weighted_p > 0'))
    dftw_filt = construct_motif_pairs(dfi_filtered, motif_pair,
                                     features=['match_weighted_p','imp_weighted_p', 'imp_weighted'] + \
                                      [f for f in dfi if "profile" in f])
    
    # assure the right strand combination
    dftw_filt[dftw_filt.center_diff < 0]['strand_combination'] = dftw_filt[dftw_filt.center_diff < 0]['strand_combination'].map(comp_strand_compbination)
    dftw_filt.center_diff = np.abs(dftw_filt.center_diff)
    
    if motif_pair[0] == motif_pair[1]:
        motif_pair = [f"{m}{i}" for i,m in enumerate(motif_pair)]
    d_dfi_filtered.append((motif_pair, dftw_filt))
100%|██████████| 10/10 [00:58<00:00,  5.81s/it]

Profile

In [171]:
fig_profile, all_axes = plt.subplots(3*len(strand_combinations), len(d_dfi_filtered), figsize=(25,15), sharex=True, sharey='row')

for j, (motif_pair, dftw_filt) in enumerate(d_dfi_filtered):
    axes = all_axes[:,j]
    if motif_pair[0][:-1] == motif_pair[1][:-1]:
        axes[0].set_title("{f}<>{f}".format(f=motif_pair[0][:-1]), fontsize=7)
        motif_pair_c = [mp[:-1] for mp in motif_pair]
    else:
        motif_pair_c = motif_pair
        axes[0].set_title("<>".join(motif_pair), fontsize=7)
    dftw_filt = dftw_filt[(dftw_filt.center_diff < 150) & (dftw_filt.imp_weighted_p.max(1) > 0.3)]

    ymax = max([np.log10(1+dftw_filt[profile_mapping[mp] + "/profile_counts"]).max().max() for mp in motif_pair_c])
    ymin = min([np.log10(1+dftw_filt[profile_mapping[mp] + "/profile_counts"]).min().min() for mp in motif_pair_c])
    #ymax = dftw_filt.imp_weighted.max().max()
    #ymin = dftw_filt.imp_weighted.min().min()
    for i, sc in enumerate(strand_combinations):
        y1 = np.log10(1+ dftw_filt[dftw_filt.strand_combination==sc][profile_mapping[motif_pair_c[0]] + "/profile_counts"][motif_pair[0]])
        y2 = np.log10(1+ dftw_filt[dftw_filt.strand_combination==sc][profile_mapping[motif_pair_c[1]] + "/profile_counts"][motif_pair[1]])
        # y1 = dftw_filt[dftw_filt.strand_combination==sc]['imp_weighted'][motif_pair[0]]
        # y2 = dftw_filt[dftw_filt.strand_combination==sc]['imp_weighted'][motif_pair[1]]
        x = dftw_filt[dftw_filt.strand_combination==sc]['center_diff']

        #dm,ym,confi = average_distance(x,y, window=5)
        dm1,ym1,confi1 = smooth_lowess(x,y1, frac=0.15)
        dm2,ym2,confi2 = smooth_lowess(x,y2, frac=0.15)
        #dm,ym, confi = smooth_gam(x,y, 140, 20)

        ax = axes[3*i]
        ax.hist(dftw_filt[dftw_filt.strand_combination==sc]['center_diff'], np.arange(10, 150, 1));
        if j == 0:
            ax.set_ylabel(sc)

        # second plot
        ax = axes[3*i+1]
        ax.scatter(x,y1, alpha=0.05, s=8)
        if confi1 is not None:
            ax.fill_between(dm1, confi1[:,0], confi1[:,1], alpha=0.2)
        ax.plot(dm1, ym1, linewidth=2, alpha=0.8)
        ax.scatter(x,y2, alpha=0.05, s=8)
        if confi2 is not None:
            ax.fill_between(dm2, confi2[:,0], confi2[:,1], alpha=0.2)
        ax.plot(dm2, ym2, linewidth=2, alpha=0.8)
        if j == 0:
            ax.set_ylabel(sc)
        # third plot, simulated
        ax = axes[3*i+2]
        sim_x, sim_y1, sim_y2 = get_xy_sim(sim_df_d, motif_pair_c, 'profile/counts_frac', sc)
        ax.axhline(1, linestyle="--", color='grey', alpha=0.2)
        ax.plot(sim_x, sim_y1, linewidth=1, alpha=0.8)
        ax.plot(sim_x, sim_y2, linewidth=1, alpha=0.8)

        ax.xaxis.set_minor_locator(plt.MultipleLocator(10))
        if j == 0:
            ax.set_ylabel(sc)
    fig_profile.subplots_adjust(wspace=0, hspace=0)

Importance

In [172]:
fig_imp, all_axes = plt.subplots(3*len(strand_combinations), len(d_dfi_filtered), figsize=(25,15), sharex=True, sharey='row')

for j, (motif_pair, dftw_filt) in enumerate(d_dfi_filtered):
    axes = all_axes[:,j]
    if motif_pair[0][:-1] == motif_pair[1][:-1]:
        axes[0].set_title("{f}<>{f}".format(f=motif_pair[0][:-1]), fontsize=7)
        motif_pair_c = [mp[:-1] for mp in motif_pair]
    else:
        motif_pair_c = motif_pair
        axes[0].set_title("<>".join(motif_pair), fontsize=7)
    dftw_filt = dftw_filt[(dftw_filt.center_diff < 150) & (dftw_filt.imp_weighted_p.max(1) > 0.3)]

    #ymax = max([np.log10(1+dftw_filt[profile_mapping[mp] + "/profile_counts"]).max().max() for mp in motif_pair_c])
    #ymin = min([np.log10(1+dftw_filt[profile_mapping[mp] + "/profile_counts"]).min().min() for mp in motif_pair_c])
    ymax = dftw_filt.imp_weighted.max().max()
    ymin = dftw_filt.imp_weighted.min().min()
    for i, sc in enumerate(strand_combinations):
        #y1 = np.log10(1+ dftw_filt[dftw_filt.strand_combination==sc][profile_mapping[motif_pair_c[0]] + "/profile_counts"][motif_pair[0]])
        #y2 = np.log10(1+ dftw_filt[dftw_filt.strand_combination==sc][profile_mapping[motif_pair_c[1]] + "/profile_counts"][motif_pair[1]])
        y1 = dftw_filt[dftw_filt.strand_combination==sc]['imp_weighted'][motif_pair[0]]
        y2 = dftw_filt[dftw_filt.strand_combination==sc]['imp_weighted'][motif_pair[1]]
        x = dftw_filt[dftw_filt.strand_combination==sc]['center_diff']

        #dm,ym,confi = average_distance(x,y, window=5)
        dm1,ym1,confi1 = smooth_lowess(x,y1, frac=0.15)
        dm2,ym2,confi2 = smooth_lowess(x,y2, frac=0.15)
        #dm,ym, confi = smooth_gam(x,y, 140, 20)

        ax = axes[3*i]
        ax.hist(dftw_filt[dftw_filt.strand_combination==sc]['center_diff'], np.arange(10, 150, 1));
        if j == 0:
            ax.set_ylabel(sc)

        # second plot
        ax = axes[3*i+1]
        ax.scatter(x,y1, alpha=0.05, s=8)
        if confi1 is not None:
            ax.fill_between(dm1, confi1[:,0], confi1[:,1], alpha=0.2)
        ax.plot(dm1, ym1, linewidth=2, alpha=0.8)
        ax.scatter(x,y2, alpha=0.05, s=8)
        if confi2 is not None:
            ax.fill_between(dm2, confi2[:,0], confi2[:,1], alpha=0.2)
        ax.plot(dm2, ym2, linewidth=2, alpha=0.8)
        if j == 0:
            ax.set_ylabel(sc)
        # third plot, simulated
        ax = axes[3*i+2]
        sim_x, sim_y1, sim_y2 = get_xy_sim(sim_df_d, motif_pair_c, 'imp/weighted_frac', sc)
        ax.axhline(1, linestyle="--", color='grey', alpha=0.2)
        ax.plot(sim_x, sim_y1, linewidth=1, alpha=0.8)
        ax.plot(sim_x, sim_y2, linewidth=1, alpha=0.8)

        ax.xaxis.set_minor_locator(plt.MultipleLocator(10))
        if j == 0:
            ax.set_ylabel(sc)
    fig_imp.subplots_adjust(wspace=0, hspace=0)
In [51]:
fig_profile.savefig("/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/spacing/profile_counts.pdf", dpi=400, bbox_inches='tight')

fig_profile.savefig("/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/spacing/profile_counts.png", dpi=300, bbox_inches='tight')

fig_imp.savefig("/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/spacing/importance_weighted.pdf", dpi=400, bbox_inches='tight')

fig_imp.savefig("/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/spacing/importance_weighted.png", dpi=300, bbox_inches='tight')

Exclude TE's

In [173]:
from basepair.preproc import dfint_no_intersection
In [174]:
dfi_filtered = (dfi[(dfi.pattern_center > 400) & (dfi.pattern_center < 600)]
                .query('match_weighted_p > 0.2')
                .query('imp_weighted_p > 0'))
keep_nonte = dfint_no_intersection(dfi_filtered[interval_cols], dfi_te[interval_cols])
print("not overlapping TE", keep_nonte.mean())  # almost all were kept
dfi_filtered = dfi_filtered[keep_nonte]
d_dfi_filtered=[]
for motif_pair in tqdm(pairs):
    dftw_filt = construct_motif_pairs(dfi_filtered, motif_pair,
                                     features=['match_weighted_p','imp_weighted_p', 'imp_weighted'] + \
                                      [f for f in dfi if "profile" in f])
    
    # assure the right strand combination
    dftw_filt[dftw_filt.center_diff < 0]['strand_combination'] = dftw_filt[dftw_filt.center_diff < 0]['strand_combination'].map(comp_strand_compbination)
    dftw_filt.center_diff = np.abs(dftw_filt.center_diff)
    
    if motif_pair[0] == motif_pair[1]:
        motif_pair = [f"{m}{i}" for i,m in enumerate(motif_pair)]
    d_dfi_filtered.append((motif_pair, dftw_filt))
  0%|          | 0/10 [00:00<?, ?it/s]
not overlapping TE 0.9583326017102421
100%|██████████| 10/10 [00:50<00:00,  5.06s/it]

Profile

In [175]:
fig_profile, all_axes = plt.subplots(3*len(strand_combinations), len(d_dfi_filtered), figsize=(25,15), sharex=True, sharey='row')

for j, (motif_pair, dftw_filt) in enumerate(d_dfi_filtered):
    axes = all_axes[:,j]
    if motif_pair[0][:-1] == motif_pair[1][:-1]:
        axes[0].set_title("{f}<>{f}".format(f=motif_pair[0][:-1]), fontsize=7)
        motif_pair_c = [mp[:-1] for mp in motif_pair]
    else:
        motif_pair_c = motif_pair
        axes[0].set_title("<>".join(motif_pair), fontsize=7)
    dftw_filt = dftw_filt[(dftw_filt.center_diff < 150) & (dftw_filt.imp_weighted_p.max(1) > 0.3)]

    ymax = max([np.log10(1+dftw_filt[profile_mapping[mp] + "/profile_counts"]).max().max() for mp in motif_pair_c])
    ymin = min([np.log10(1+dftw_filt[profile_mapping[mp] + "/profile_counts"]).min().min() for mp in motif_pair_c])
    #ymax = dftw_filt.imp_weighted.max().max()
    #ymin = dftw_filt.imp_weighted.min().min()
    for i, sc in enumerate(strand_combinations):
        y1 = np.log10(1+ dftw_filt[dftw_filt.strand_combination==sc][profile_mapping[motif_pair_c[0]] + "/profile_counts"][motif_pair[0]])
        y2 = np.log10(1+ dftw_filt[dftw_filt.strand_combination==sc][profile_mapping[motif_pair_c[1]] + "/profile_counts"][motif_pair[1]])
        # y1 = dftw_filt[dftw_filt.strand_combination==sc]['imp_weighted'][motif_pair[0]]
        # y2 = dftw_filt[dftw_filt.strand_combination==sc]['imp_weighted'][motif_pair[1]]
        x = dftw_filt[dftw_filt.strand_combination==sc]['center_diff']

        #dm,ym,confi = average_distance(x,y, window=5)
        dm1,ym1,confi1 = smooth_lowess(x,y1, frac=0.15)
        dm2,ym2,confi2 = smooth_lowess(x,y2, frac=0.15)
        #dm,ym, confi = smooth_gam(x,y, 140, 20)

        ax = axes[3*i]
        ax.hist(dftw_filt[dftw_filt.strand_combination==sc]['center_diff'], np.arange(10, 150, 1));
        if j == 0:
            ax.set_ylabel(sc)

        # second plot
        ax = axes[3*i+1]
        ax.scatter(x,y1, alpha=0.05, s=8)
        if confi1 is not None:
            ax.fill_between(dm1, confi1[:,0], confi1[:,1], alpha=0.2)
        ax.plot(dm1, ym1, linewidth=2, alpha=0.8)
        ax.scatter(x,y2, alpha=0.05, s=8)
        if confi2 is not None:
            ax.fill_between(dm2, confi2[:,0], confi2[:,1], alpha=0.2)
        ax.plot(dm2, ym2, linewidth=2, alpha=0.8)
        if j == 0:
            ax.set_ylabel(sc)
        # third plot, simulated
        ax = axes[3*i+2]
        sim_x, sim_y1, sim_y2 = get_xy_sim(sim_df_d, motif_pair_c, 'profile/counts_frac', sc)
        ax.axhline(1, linestyle="--", color='grey', alpha=0.2)
        ax.plot(sim_x, sim_y1, linewidth=1, alpha=0.8)
        ax.plot(sim_x, sim_y2, linewidth=1, alpha=0.8)

        ax.xaxis.set_minor_locator(plt.MultipleLocator(10))
        if j == 0:
            ax.set_ylabel(sc)
    fig_profile.subplots_adjust(wspace=0, hspace=0)

Importance

In [176]:
fig_imp, all_axes = plt.subplots(3*len(strand_combinations), len(d_dfi_filtered), figsize=(25,15), sharex=True, sharey='row')

for j, (motif_pair, dftw_filt) in enumerate(d_dfi_filtered):
    axes = all_axes[:,j]
    if motif_pair[0][:-1] == motif_pair[1][:-1]:
        axes[0].set_title("{f}<>{f}".format(f=motif_pair[0][:-1]), fontsize=7)
        motif_pair_c = [mp[:-1] for mp in motif_pair]
    else:
        motif_pair_c = motif_pair
        axes[0].set_title("<>".join(motif_pair), fontsize=7)
    dftw_filt = dftw_filt[(dftw_filt.center_diff < 150) & (dftw_filt.imp_weighted_p.max(1) > 0.3)]

    #ymax = max([np.log10(1+dftw_filt[profile_mapping[mp] + "/profile_counts"]).max().max() for mp in motif_pair_c])
    #ymin = min([np.log10(1+dftw_filt[profile_mapping[mp] + "/profile_counts"]).min().min() for mp in motif_pair_c])
    ymax = dftw_filt.imp_weighted.max().max()
    ymin = dftw_filt.imp_weighted.min().min()
    for i, sc in enumerate(strand_combinations):
        #y1 = np.log10(1+ dftw_filt[dftw_filt.strand_combination==sc][profile_mapping[motif_pair_c[0]] + "/profile_counts"][motif_pair[0]])
        #y2 = np.log10(1+ dftw_filt[dftw_filt.strand_combination==sc][profile_mapping[motif_pair_c[1]] + "/profile_counts"][motif_pair[1]])
        y1 = dftw_filt[dftw_filt.strand_combination==sc]['imp_weighted'][motif_pair[0]]
        y2 = dftw_filt[dftw_filt.strand_combination==sc]['imp_weighted'][motif_pair[1]]
        x = dftw_filt[dftw_filt.strand_combination==sc]['center_diff']

        #dm,ym,confi = average_distance(x,y, window=5)
        dm1,ym1,confi1 = smooth_lowess(x,y1, frac=0.15)
        dm2,ym2,confi2 = smooth_lowess(x,y2, frac=0.15)
        #dm,ym, confi = smooth_gam(x,y, 140, 20)

        ax = axes[3*i]
        ax.hist(dftw_filt[dftw_filt.strand_combination==sc]['center_diff'], np.arange(10, 150, 1));
        if j == 0:
            ax.set_ylabel(sc)

        # second plot
        ax = axes[3*i+1]
        ax.scatter(x,y1, alpha=0.05, s=8)
        if confi1 is not None:
            ax.fill_between(dm1, confi1[:,0], confi1[:,1], alpha=0.2)
        ax.plot(dm1, ym1, linewidth=2, alpha=0.8)
        ax.scatter(x,y2, alpha=0.05, s=8)
        if confi2 is not None:
            ax.fill_between(dm2, confi2[:,0], confi2[:,1], alpha=0.2)
        ax.plot(dm2, ym2, linewidth=2, alpha=0.8)
        if j == 0:
            ax.set_ylabel(sc)
        # third plot, simulated
        ax = axes[3*i+2]
        sim_x, sim_y1, sim_y2 = get_xy_sim(sim_df_d, motif_pair_c, 'imp/weighted_frac', sc)
        ax.axhline(1, linestyle="--", color='grey', alpha=0.2)
        ax.plot(sim_x, sim_y1, linewidth=1, alpha=0.8)
        ax.plot(sim_x, sim_y2, linewidth=1, alpha=0.8)

        ax.xaxis.set_minor_locator(plt.MultipleLocator(10))
        if j == 0:
            ax.set_ylabel(sc)
fig_imp.subplots_adjust(wspace=0, hspace=0)
In [177]:
fig_profile.savefig("/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/spacing/non-TE.profile_counts.pdf", dpi=400, bbox_inches='tight')

fig_profile.savefig("/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/spacing/non-TE.profile_counts.png", dpi=300, bbox_inches='tight')

fig_imp.savefig("/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/spacing/non-TE.importance_weighted.pdf", dpi=400, bbox_inches='tight')

fig_imp.savefig("/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/spacing/non-TE.importance_weighted.png", dpi=300, bbox_inches='tight')