# Notebook-style setup: inline plotting, project-wide imports, and the paths
# for the ChIP-nexus Oct4/Sox2/Nanog/Klf4 model and its TF-MoDISco results.
%matplotlib inline
from uuid import uuid4
from collections import OrderedDict
from basepair.math import mean
from basepair.stats import perc
from IPython.display import display, HTML
from basepair.plot.vdom import df2html, df2html_old, render_datatable
from basepair.modisco.core import patterns_to_df
from basepair.modisco.utils import longer_pattern, shorten_pattern, extract_name_long
from basepair.imports import *
from basepair.utils import apply_parallel
import holoviews as hv
hv.extension('bokeh')
# Directory layout: trained model -> modisco run -> interpretable-model outputs.
# NOTE(review): `ddir` and `Path` are assumed to come from the
# `basepair.imports` star-import — confirm.
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/profile/"
output_dir = modisco_dir / 'interpretable-model'
gspan_dir = output_dir / 'gSpan'
gspan_output = gspan_dir / "gspan.output.txt"
# Open the TF-MoDISco result file (holds the discovered motifs/patterns)
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()
# Task names with any "/<suffix>" stripped
tasks = [x.split("/")[0] for x in mr.tasks()]
from basepair.modisco.motif_clustering import to_colors, preproc_motif_table, motif_table_long, scale, preproc_df
from basepair.modisco.pattern_instances import load_instances, filter_nonoverlapping_intervals, plot_coocurence_matrix
# specify motifs to use in the analysis
motifs = OrderedDict([
("Oct4-Sox2", "m0_p0"),
("Oct4-Sox2-deg", "m6_p8"),
("Oct4", "m0_p18"),
("Sox2", "m0_p1"),
("Essrb", "m0_p2"),
("Nanog", "m0_p3"),
("Nanog-periodic", "m0_p9"),
("Klf4", "m2_p0"),
])
# Load instances
try:
dfi = load_instances(f"{modisco_dir}/instances.subset.parq", motifs, dedup=False)
except:
dfi = load_instances(f"{modisco_dir}/instances.parq", motifs, dedup=False)
dfi.set_index("example_idx", inplace=True)
# save back to a file
# dfi.to_parquet(f"{modisco_dir}/instances.subset.parq")
!du -sh {modisco_dir}/instances.*parq
from basepair.imports import *
import pybedtools
from genomelake.extractors import BigwigExtractor
from basepair.extractors import StrandedBigWigExtractor, bw_extract
from kipoiseq.transforms import ResizeInterval
from basepair.modisco.table import ModiscoData
from pybedtools import BedTool
from basepair.cli.modisco import load_ranges
from basepair.plot.heatmaps import (heatmap_importance_profile, normalize, multiple_heatmap_stranded_profile,
heatmap_stranded_profile)
from basepair.plot.profiles import plot_stranded_profile
import hvplot.pandas
from basepair.plots import regression_eval
create_tf_session(0)
df = pd.read_csv(f"{ddir}/processed/chipnexus/external-data.tsv", sep='\t')
df = df.set_index('assay')
df
dfi.head()
Write the bed file
!mkdir -p {ddir}/processed/activity/data/
dfi.reset_index()[['example_chrom', 'example_start', 'example_end', 'example_idx']].drop_duplicates().to_csv(f"{ddir}/processed/activity/data/peak.instances.bed", sep='\t', index=False, header=False)
from basepair.cli.schemas import DataSpec
# DataSpec lists the bigwig tracks / fasta file the model was trained with
ds = DataSpec.load(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/dataspec.yml")
!head -n 2 {ddir}/processed/activity/data/peak.instances.bed
# External activity assays to correlate against motif features
assays = ['H3K27ac', 'PolII', 'Groseq']
def tolist(s):
    """Coerce *s* to a list, wrapping a bare string as a one-element list."""
    return [s] if isinstance(s, str) else list(s)
# assay -> list of bigwig paths: external assays from the table, plus the
# ChIP-nexus tasks' positive/negative strand count tracks from the dataspec
bigwigs = {a: tolist(df.loc[a].path) for a in assays}
bigwigs = {**bigwigs, **{t: [ds.task_specs[t].pos_counts, ds.task_specs[t].neg_counts] for t in ds.task_specs}}
bigwigs
from basepair.datasets import *
from basepair.config import valid_chr, test_chr
from kipoi.data_utils import numpy_collate
class ActivityDataset(Dataset):
    """Dataset yielding the total bigwig signal over each bed interval.

    For every interval in `intervals_file`, a per-task resized copy of the
    interval is extracted from each of the task's bigwigs; the coverages are
    summed into one scalar target per task.

    Args:
        intervals_file: bed4 file containing chrom start end name
        fasta_file: file path; Genome sequence
        bigwigs: dict: task -> list of bigwig file paths (summed per task)
        track_width: int or dict task -> int; width each interval is resized to
        incl_chromosomes: if given, keep only intervals on these chromosomes
        excl_chromosomes: if given, drop intervals on these chromosomes
        num_chr_fasta: if True, the tsv-loader will make sure that the chromosomes
            don't start with chr
    """

    def __init__(self, intervals_file,
                 fasta_file,
                 bigwigs,
                 track_width=2000,
                 incl_chromosomes=None,
                 excl_chromosomes=None,
                 num_chr_fasta=False):
        self.num_chr_fasta = num_chr_fasta
        self.intervals_file = intervals_file
        self.fasta_file = fasta_file
        self.bigwigs = bigwigs
        self.incl_chromosomes = incl_chromosomes
        self.excl_chromosomes = excl_chromosomes
        self.tasks = list(self.bigwigs)
        if isinstance(track_width, dict):
            self.track_width = track_width
        else:
            # track_width always needs to be the dictionary
            self.track_width = {t: track_width for t in self.tasks}
        self.tsv = BedDataset(self.intervals_file,
                              num_chr=self.num_chr_fasta,
                              bed_columns=4,
                              ignore_targets=True,
                              incl_chromosomes=incl_chromosomes,
                              excl_chromosomes=excl_chromosomes,
                              )
        # Extractors are created lazily on first __getitem__ (presumably so the
        # dataset object pickles cleanly for worker processes — TODO confirm)
        self.fasta_extractor = None
        self.bigwig_extractors = None

    def __len__(self):
        return len(self.tsv)

    def __getitem__(self, idx):
        if self.fasta_extractor is None:
            self.fasta_extractor = FastaExtractor(self.fasta_file)
            self.bigwig_extractors = {a: [BigwigExtractor(f) for f in self.bigwigs[a]]
                                      for a in self.bigwigs}
        interval, labels = self.tsv[idx]
        # interval = resize_interval(interval, self.track_width[t])
        # # Intervals need to be 1000bp wide
        # assert interval.stop - interval.start == 1000
        # Run the fasta extractor
        # seq = np.squeeze(self.fasta_extractor([interval]))
        # Each task gets its own resized copy of the interval (deepcopy so the
        # original interval and the other tasks' copies are not mutated)
        resized_intervals = {t: resize_interval(deepcopy(interval), self.track_width[t])
                             for t in self.tasks}
        # Scalar target per task: sum over the task's bigwigs, then over positions
        targets = {a: sum([e([resized_intervals[a]])[0]
                           for e in self.bigwig_extractors[a]]).sum()
                   for a in self.tasks}
        return {
            # "inputs": {"seq": seq},
            "targets": targets,
            "metadata": {
                "ranges": GenomicRanges(interval.chrom, interval.start, interval.stop, str(idx)),
                "ranges_wide": {t: GenomicRanges.from_interval(interval_wide)
                                for t, interval_wide in resized_intervals.items()},
                "name": interval.name
            }
        }

    def get_targets(self):
        return self.tsv.get_targets()
# External assays use 2kb windows; ChIP-nexus tasks use 1kb
track_width = {t: 2000 if t in assays else 1000 for t in bigwigs}
track_width
# Training set: exclude validation + test chromosomes
dl = ActivityDataset(f"{ddir}/processed/activity/data/peak.instances.bed", ds.fasta_file, bigwigs,
                     track_width=track_width,
                     excl_chromosomes=valid_chr + test_chr)
train = numpy_collate([dl[i] for i in tqdm(range(len(dl)))])
dfy = pd.DataFrame(train['targets'])
# Validation set: only validation chromosomes
dl = ActivityDataset(f"{ddir}/processed/activity/data/peak.instances.bed", ds.fasta_file, bigwigs,
                     track_width=track_width,
                     incl_chromosomes=valid_chr)
valid = numpy_collate([dl[i] for i in tqdm(range(len(dl)))])
dfy_valid = pd.DataFrame(valid['targets'])
dfi.head()
# Columns
# pattern_short, match_weighted_p, imp_weighted_p, pattern_center,
columns = ["example_idx", "pattern_name", "example_center", "imp_weighted_p", "match_weighted_p"]
# Toy frame `x` used below to develop/sanity-check the feature functions
values = ['a', 'b', 'c']
data = {"example_idx": [1, 1, 1],
        # 'c' is an unobserved category -> exercises the empty-group path
        "pattern_name": pd.Categorical(['a', 'a', 'b'], values),
        'pattern_center': [10, 20, 50],
        'imp_weighted_p': [.5] * 3,
        'match_weighted_p': [.5] * 3}
x = pd.DataFrame(data)
# Get all features for the strongest motif
# imp_weighted_p and match_weighted_p for the strongest motif
def index_prefix(s, value):
    """Return a copy of *s* with *value* prepended to every index label."""
    return s.rename(index=lambda label: value + str(label))
def index_suffix(s, value):
    """Return a copy of *s* with *value* appended to every index label."""
    return s.rename(index=lambda label: str(label) + value)
# --------------------------------------------
# Features
def counts(x):
    """Per-pattern occurrence counts, indexed as '<pattern>/counts'."""
    per_pattern = x['pattern_name'].value_counts()
    return index_suffix(per_pattern, '/counts')
def present(x):
    """Boolean per-pattern presence flags, indexed as '<pattern>/present'."""
    occurred = x['pattern_name'].value_counts() > 0
    return index_suffix(occurred, '/present')
def features_max(x, max_feat='imp_weighted_p',
                 output_feat=('imp_weighted_p', 'match_weighted_p', 'pattern_center')):
    """Per pattern, report `output_feat` values of the strongest instance.

    Args:
        x: DataFrame with a 'pattern_name' column plus the `output_feat` columns.
        max_feat: column used to rank instances within each pattern.
        output_feat: columns reported for the top-ranked instance.

    Returns:
        pd.Series indexed as '<pattern>/max/<feature>'; all-NaN entries for
        unobserved pattern categories.
    """
    # Tuple default replaces the original mutable-default list (shared across
    # calls); normalize to a list for column selection.
    output_feat = list(output_feat)

    def subset_max(group):
        # Empty group (unobserved category) -> a single all-NaN row.
        # (Original shadowed the outer `x` here and in the comprehension.)
        if len(group) == 0:
            return pd.DataFrame({feat: [np.nan] for feat in output_feat})
        else:
            return group.loc[[group[max_feat].idxmax()]][output_feat]
    # Select the strongest motif instance within each pattern
    x_single = x.groupby("pattern_name").apply(subset_max).reset_index()
    del x_single['level_1']
    # Flatten (pattern, feature) pairs into a single wide row
    x_single = pd.DataFrame(x_single.set_index('pattern_name').stack(dropna=False)).T
    x_single.columns = ['%s%s' % (a, '/max/%s' % b if b else '')
                        for a, b in x_single.columns]
    return x_single.iloc[0]
# --------------------------------------------
# run for all the features (sanity check on the toy frame `x`)
pd.concat([counts(x), present(x), features_max(x)])
# TODO - make combination of different features
# make pattern_name categorical (fixes category set/order to `motifs`)
dfi.pattern_name = pd.Categorical(dfi.pattern_name, list(motifs))
# Missing scores are treated as zero evidence
dfi.imp_weighted_p = dfi.imp_weighted_p.fillna(0)
dfi.match_weighted_p = dfi.match_weighted_p.fillna(0)
x
# Single-example feature row, indexed by the example's index label
df = pd.DataFrame(pd.concat([counts(x), present(x), features_max(x)])).T
df.index = [x.index[0]]
df
def apply_fn(x):
    """Collapse all motif instances of one example into one feature row."""
    feats = pd.concat([counts(x), present(x), features_max(x)])
    row = pd.DataFrame(feats).T
    row['example_idx'] = x['example_idx'].iloc[0]
    return row
# TODO - put as a jupyter notebook magic?
# Close any lingering tqdm progress bars before spawning parallel workers
from tqdm import tqdm as tqdm_cls
inst = tqdm_cls._instances
for i in range(len(inst)):
    inst.pop().close()
# Build the per-example feature matrix in parallel (-1 = use all cores)
X_feat = apply_parallel(dfi.reset_index()[['example_idx', 'pattern_name',
                                           'match_weighted_p', 'imp_weighted_p',
                                           'pattern_center']].groupby('example_idx'), apply_fn, -1)
# Assemble train/validation design matrices from the per-example motif features.
X = X_feat
y_cols = ['H3K27ac', 'PolII', 'Groseq', 'Oct4', 'Sox2', 'Nanog', 'Klf4']
x_cols = [c for c in X.columns if c != 'example_idx']
from sklearn.preprocessing import StandardScaler
# Training targets: log10(counts + 1), then standardized
y_train = pd.DataFrame(train['targets'])
y_train['example_idx'] = train['metadata']['name'].astype(int)
Xy_train = pd.merge(y_train, X, on='example_idx')
y = Xy_train[y_cols]
yscaler = StandardScaler()
y = yscaler.fit_transform(np.log10(y + 1).values)
x = Xy_train[x_cols]
x = x.fillna(0)
# Validation targets: scaled with the TRAIN-fitted scaler. (The original
# called fit_transform here, silently re-fitting the scaler on validation
# data — a leakage/consistency bug.)
y_valid = pd.DataFrame(valid['targets'])
y_valid['example_idx'] = valid['metadata']['name'].astype(int)
Xy_valid = pd.merge(y_valid, X, on='example_idx')
y_valid = Xy_valid[y_cols]
y_valid = yscaler.transform(np.log10(y_valid + 1).values)
x_valid = Xy_valid[x_cols]
x_valid = x_valid.fillna(0)
# NOTE(review): the next line was truncated in the source and could never
# parse/run — kept for reference, disabled:
# HDF5BatchWriter.dump(output_dir, {"inputs"
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict
from basepair.plot.evaluate import regression_eval
from basepair.plot.config import paper_config
paper_config()
# Linear-regression baseline: fit one model per target assay, evaluate on
# the validation chromosomes
i = 0
for i in range(len(y_cols)):
    fig, ax = plt.subplots(figsize=(3, 3))
    print(i, y_cols[i])
    m = LinearRegression()
    m.fit(x.values, y[:, i])
    preds = m.predict(x_valid)
    regression_eval(y_valid[:, i], preds, task=y_cols[i], alpha=0.5, ax=ax);
# Random forest: same setup, nonlinear model
i = 0
for i in range(len(y_cols)):
    fig, ax = plt.subplots(figsize=(3, 3))
    print(i, y_cols[i])
    m = RandomForestRegressor(n_estimators=100, n_jobs=5)
    m.fit(x.values, y[:, i])
    preds = m.predict(x_valid)
    regression_eval(y_valid[:, i], preds, task=y_cols[i], alpha=0.5, ax=ax);
from basepair.modisco.gspan import load_gspan_as_features
# Frequent-subgraph (gSpan) features: motif co-occurrence patterns per example
gspan_dir = modisco_dir / 'interpretable-model/gSpan'
graph_file = gspan_dir / 'match!=low,imp=high,dist=10-50.data'
gspan_output = gspan_dir / "gspan.output.txt"
dfg = load_gspan_as_features(gspan_output, str(graph_file) + ".idx")
dfg.head()
# Append to X; examples with no graph match get False for all graph features
X = pd.merge(X_feat, dfg.reset_index(), on='example_idx', how='left')
X[dfg.columns] = X[dfg.columns].fillna(False)
# Rebuild the train/validation matrices, now with gSpan graph features in X.
y_cols = ['H3K27ac', 'PolII', 'Groseq', 'Oct4', 'Sox2', 'Nanog', 'Klf4']
x_cols = [c for c in X.columns if c != 'example_idx']
from sklearn.preprocessing import StandardScaler
# Training targets: log10(counts + 1), then standardized
y_train = pd.DataFrame(train['targets'])
y_train['example_idx'] = train['metadata']['name'].astype(int)
Xy_train = pd.merge(y_train, X, on='example_idx')
y = Xy_train[y_cols]
yscaler = StandardScaler()
y = yscaler.fit_transform(np.log10(y + 1).values)
x = Xy_train[x_cols]
x = x.fillna(0)
# Validation targets: scaled with the TRAIN-fitted scaler. (The original
# called fit_transform here, re-fitting the scaler on validation data.)
y_valid = pd.DataFrame(valid['targets'])
y_valid['example_idx'] = valid['metadata']['name'].astype(int)
Xy_valid = pd.merge(y_valid, X, on='example_idx')
y_valid = Xy_valid[y_cols]
y_valid = yscaler.transform(np.log10(y_valid + 1).values)
x_valid = Xy_valid[x_cols]
x_valid = x_valid.fillna(0)
x.shape
# Re-run both baselines with the graph-augmented feature matrix
# Linear regression, one model per target assay
i = 0
for i in range(len(y_cols)):
    fig, ax = plt.subplots(figsize=(3, 3))
    print(i, y_cols[i])
    m = LinearRegression()
    m.fit(x.values, y[:, i])
    preds = m.predict(x_valid)
    regression_eval(y_valid[:, i], preds, task=y_cols[i], alpha=0.5, ax=ax);
# Random forest, one model per target assay
i = 0
for i in range(len(y_cols)):
    fig, ax = plt.subplots(figsize=(3, 3))
    print(i, y_cols[i])
    m = RandomForestRegressor(n_estimators=100, n_jobs=5)
    m.fit(x.values, y[:, i])
    preds = m.predict(x_valid)
    regression_eval(y_valid[:, i], preds, task=y_cols[i], alpha=0.5, ax=ax);