Goal

test modisco table

In [1]:
from basepair.modisco.table import ModiscoData, modisco_table, write_modisco_table
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
In [2]:
from basepair.config import get_data_dir
from pathlib import Path
ddir = get_data_dir()
In [3]:
ls {model_dir}
ls: cannot access '{model_dir}': No such file or directory
In [5]:
%%time
# make it first for the easier case
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / "modisco/valid"

data = ModiscoData.load(modisco_dir, model_dir / "grad.valid.h5")
100%|██████████| 70/70 [04:08<00:00,  3.55s/it]
CPU times: user 2min 23s, sys: 2min 25s, total: 4min 48s
Wall time: 4min 48s

In [6]:
%%time
df = modisco_table(data)
100%|██████████| 70/70 [00:37<00:00,  1.87it/s]
CPU times: user 31.4 s, sys: 32.1 s, total: 1min 3s
Wall time: 37.5 s
/users/avsec/workspace/basepair/basepair/modisco/table.py:185: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access
  ("consensus", "consensus sequence (tallest letters from logo pwm)"),
In [7]:
output_dir = "/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/valid"
report_url = 'http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/valid/results.html'
write_modisco_table(df, Path(output_dir), report_url)
In [263]:
import seaborn as sns
cm = sns.light_palette("green", as_cmap=True)
# leverage pandas style to color cells according to values
# https://pandas.pydata.org/pandas-docs/stable/style.html
s = df.style.background_gradient(cmap=cm).set_precision(3)
In [147]:
def modisco_table(modisco_dir, imp_scores, output_dir, report_url=None):
    """Write the pattern table to as .html and .csv
    """
    data = ModiscoData.load(modisco_dir, imp_scores)
    df = modisco_table(data, report_url=None)
    write_modisco_table(df, output_dir)
In [239]:
import seaborn as sns
cm = sns.light_palette("green", as_cmap=True)
In [244]:
'class="data' in s.render()
Out[244]:
True
In [231]:
a = df.style.bar(subset=df.columns[df.dtypes == float], 
             align='mid', color=['#d65f5f', '#5fba7d'])
In [233]:
table = a.render(table_id="asd")
In [219]:
from IPython.display import HTML

def hover(hover_color="#ffff99"):
    return dict(selector="tr:hover",
                props=[("background-color", "%s" % hover_color)])

styles = [
    hover(),
    dict(selector="th", props=[("font-size", "150%"),
                               ("text-align", "center")]),
    dict(selector="caption", props=[("caption-side", "bottom")])
]
In [149]:
from scipy.stats import entropy

from vdom.helpers import a
In [136]:
p = np.ones(100)
In [137]:
entropy(p)
Out[137]:
4.60517018598809
In [138]:
entropy(p/p.sum())
Out[138]:
4.605170185988092

TODO

  • try to smooth the signal and then compute the entropy
  • compute the cross-correlation
    • allow for shit
  • pos_mean -> dist from the center
In [18]:
import matplotlib.pyplot as plt
from scipy.stats import entropy
from basepair.plot.profiles import plot_stranded_profile
In [99]:
import numpy as np
In [178]:
pattern = "metacluster_4/pattern_0"
task = "Sox2"
In [179]:
agg_profile = data.get_profile_wide(pattern, task).mean(axis=0)
agg_profile_norm = agg_profile / agg_profile.sum(axis=0, keepdims=True)
In [180]:
task_idx = data.get_peak_task_idx(task)
positions = np.array([s.center() for s in data.get_seqlets(pattern)
                      if s.seqname in task_idx])
In [181]:
values = positions
In [192]:
from scipy.stats import gaussian_kde
from scipy.signal import argrelextrema
density = gaussian_kde(values)
xs = np.linspace(0, 1000, 100)
smooth_hist = density(xs)
maxima = argrelextrema(smooth_hist, np.greater)
In [193]:
maxima
Out[193]:
(array([58]),)
In [194]:
np.sum(smooth_hist[maxima[0]] > 0.5 * smooth_hist.max())
Out[194]:
1
In [229]:
import basepair
from basepair.plot import vdom
from basepair.plot.vdom import vdom_modisco
from basepair.plot.vdom import render_datatable
from basepair.utils import read_pkl, write_pkl
In [1]:
write_pkl(df, "/tmp/df.pkl")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-bc569fc06329> in <module>()
----> 1 write_pkl(df, "/tmp/df.pkl")

NameError: name 'write_pkl' is not defined
In [12]:
df = read_pkl("/tmp/df.pkl")
In [25]:
df.insert(0, 'idx', df.index)  # add also idx to the front
In [13]:
from IPython.display import HTML
In [14]:
HTML(df.iloc[0][2])
Out[14]:
In [121]:
from basepair.plot.vdom import df2html, render_datatable, write_datatable_html
In [16]:
df2html(df)[:100]
Out[16]:
'<table border="1" id="table_id" style="width:100%" class="dataframe display nowrap">\n  <thead>\n    <'

TODO - investigate the results

  • write a function bring-to-front
  • check that you get the top motifs correctly
In [39]:
br().to_html()
Out[39]:
'<br></br>'
In [47]:
df.iloc[:,2] = df.iloc[:,2].str.replace("<img ", "<img width=100 ")
In [212]:
df.columns = [c.replace(" ", "<br>") for c in df.columns]
In [119]:
from IPython.display import HTML, Javascript
In [120]:
display(Javascript(""" $(document).ready( function () {
    var table = $('#table_id').DataTable({
         scrollX: true,
         scrollY: '50vh',
         scrollCollapse: true,
         paging: false,
         colReorder: true,
         columnDefs: [
            { orderable: false, targets: 0 },
            { orderable: false, targets: 1 }
        ],
        ordering: [[ 1, 'asc' ]],
        colReorder: {
            fixedColumnsLeft: 1,
            fixedColumnsRight: 0
        }
    });
    
    new $.fn.dataTable.FixedColumns( table, {
        leftColumns: 3,
        rightColumns: 0
    } );
    
    $('#table_id tbody').on( 'click', 'tr', function () {
        $(this).toggleClass('selected');
    } );
    } );"""))
In [44]:
write_datatable_html(df, "table.html")
In [22]:
from basepair.modisco.table import *
In [19]:
pattern = "metacluster_0/pattern_0"
In [28]:
pattern_task_features(pattern, "Oct4", data)
Out[28]:
[('Oct4_imp_profile', 0.042554803),
 ('Oct4_imp_counts', 0.05932223),
 ('Oct4_footprint_entropy', 7.4895344),
 ('Oct4_footprint_max', 0.059973598),
 ('Oct4_footprint_counts', 119.887344),
 ('Oct4_region_counts', 775.7744),
 ('Oct4_pos_mean', 500.5015216068168),
 ('Oct4_pos_std', 89.12964135382359),
 ('Oct4_pos_unimodal', False)]
In [8]:
pssm = data.mr.get_pssm('metacluster_0', 'pattern_0', trim_frac=data.trim_frac)
In [9]:
pssm.shape
Out[9]:
(14, 4)