Goal

  • Set up the dataset for model training

TODO

  • use more advanced combinatorial features to improve the activity models
In [1]:
%matplotlib inline
from uuid import uuid4
from collections import OrderedDict
from basepair.math import mean
from basepair.stats import perc

from IPython.display import display, HTML
from basepair.plot.vdom import df2html, df2html_old, render_datatable

from basepair.modisco.core import patterns_to_df
from basepair.modisco.utils import longer_pattern, shorten_pattern, extract_name_long
from basepair.imports import *
from basepair.utils import apply_parallel
import holoviews as hv
hv.extension('bokeh')
Using TensorFlow backend.

Required files

In [2]:
# Directory of the trained multi-task ChIP-nexus model (9 dilated layers)
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
# TF-MoDISco results computed from the profile contribution scores
modisco_dir = model_dir / "modisco/all/profile/"

# Outputs of the interpretable-model / gSpan graph-mining analysis
output_dir = modisco_dir / 'interpretable-model'
gspan_dir = output_dir / 'gSpan'
gspan_output = gspan_dir / "gspan.output.txt"
In [3]:
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()
tasks = [x.split("/")[0] for x in mr.tasks()]

In [4]:
from basepair.modisco.motif_clustering import to_colors, preproc_motif_table, motif_table_long, scale, preproc_df
from basepair.modisco.pattern_instances import load_instances, filter_nonoverlapping_intervals, plot_coocurence_matrix
In [5]:
# specify motifs to use in the analysis
# (maps a human-readable motif name -> modisco pattern id "m<metacluster>_p<pattern>")
motifs = OrderedDict()
motifs["Oct4-Sox2"] = "m0_p0"
motifs["Oct4-Sox2-deg"] = "m6_p8"
motifs["Oct4"] = "m0_p18"
motifs["Sox2"] = "m0_p1"
motifs["Essrb"] = "m0_p2"
motifs["Nanog"] = "m0_p3"
motifs["Nanog-periodic"] = "m0_p9"
motifs["Klf4"] = "m2_p0"
In [6]:
# Load instances: prefer the smaller pre-built subset file when it exists;
# otherwise fall back to the full table. (The previous bare `except:` silently
# swallowed *any* error — including corrupt files and KeyboardInterrupt — so
# we check for the file explicitly instead.)
subset_path = Path(f"{modisco_dir}/instances.subset.parq")
if subset_path.exists():
    # the subset file was written after set_index("example_idx"),
    # so its index is already in place
    dfi = load_instances(str(subset_path), motifs, dedup=False)
else:
    dfi = load_instances(f"{modisco_dir}/instances.parq", motifs, dedup=False)
    dfi.set_index("example_idx", inplace=True)
In [7]:
# save back to a file
In [8]:
# dfi.to_parquet(f"{modisco_dir}/instances.subset.parq")
In [9]:
!du -sh {modisco_dir}/instances.*parq
18G	/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/instances.parq
721M	/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/instances.subset.parq

Get the response values

  • chip-nexus counts in the center
  • H3K27ac
  • PolII
In [10]:
from basepair.imports import *
import pybedtools
from genomelake.extractors import BigwigExtractor
from basepair.extractors import StrandedBigWigExtractor, bw_extract
from kipoiseq.transforms import ResizeInterval
from basepair.modisco.table import ModiscoData
from pybedtools import BedTool
from basepair.cli.modisco import load_ranges
from basepair.plot.heatmaps import (heatmap_importance_profile, normalize, multiple_heatmap_stranded_profile,
                                    heatmap_stranded_profile)
from basepair.plot.profiles import plot_stranded_profile
import hvplot.pandas
from basepair.plots import regression_eval
In [11]:
create_tf_session(0)
Out[11]:
<tensorflow.python.client.session.Session at 0x7f1309f76c88>

Load the data

In [12]:
df = pd.read_csv(f"{ddir}/processed/chipnexus/external-data.tsv", sep='\t')
df = df.set_index('assay')
In [13]:
df
Out[13]:
axis path
assay
DNase 0 /srv/scratch/avsec/wo...
DNase 0 /srv/scratch/avsec/wo...
H3K27ac 0 /srv/scratch/avsec/wo...
... ... ...
Groseq 0 /srv/scratch/avsec/wo...
MNase-wt 0 /srv/scratch/avsec/wo...
MNase-h4 0 /srv/scratch/avsec/wo...

14 rows × 2 columns

In [14]:
dfi.head()
Out[14]:
pattern pattern_start pattern_end strand pattern_len pattern_center match_weighted match_weighted_p match_weighted_cat match_max match_max_task imp_weighted imp_weighted_p imp_weighted_cat imp_max imp_max_task seq_match seq_match_p seq_match_cat match/Klf4 match/Nanog match/Oct4 match/Sox2 imp/Klf4 imp/Nanog imp/Oct4 imp/Sox2 example_chrom example_start example_end example_strand example_interval_from_task pattern_short pattern_name pattern_start_abs pattern_end_abs
example_idx
0 metacluster_0/pattern_0 103 119 + 16 111 0.1921 0.0050 low 0.3536 Oct4 0.0713 NaN NaN 0.1006 Oct4 4.1595 0.0377 low 0.2491 -0.2061 0.3536 0.2158 0.0602 0.0785 0.1006 0.0339 chrX 143482544 143483544 * Oct4 m0_p0 Oct4-Sox2 143482647 143482663
0 metacluster_0/pattern_0 157 173 + 16 165 0.1157 0.0002 low 0.1914 Nanog 0.0195 NaN NaN 0.0381 Klf4 0.6651 0.0026 low -0.0011 0.1914 0.1436 0.0984 0.0381 0.0165 0.0159 0.0150 chrX 143482544 143483544 * Oct4 m0_p0 Oct4-Sox2 143482701 143482717
0 metacluster_0/pattern_0 263 279 - 16 271 0.1611 0.0017 low 0.2110 Sox2 0.0311 NaN NaN 0.0673 Nanog 1.9870 0.0094 low 0.0896 0.0628 0.2078 0.2110 0.0195 0.0673 0.0230 0.0241 chrX 143482544 143483544 * Oct4 m0_p0 Oct4-Sox2 143482807 143482823
0 metacluster_0/pattern_0 281 297 + 16 289 0.2917 0.0754 low 0.3629 Sox2 0.0326 NaN NaN 0.0359 Nanog 5.3535 0.0859 low 0.2637 0.2186 0.2894 0.3629 0.0266 0.0359 0.0345 0.0314 chrX 143482544 143483544 * Oct4 m0_p0 Oct4-Sox2 143482825 143482841
0 metacluster_0/pattern_0 440 456 - 16 448 0.1653 0.0023 low 0.1970 Sox2 0.3868 0.0564 low 0.9669 Nanog 0.4610 0.0019 low 0.1522 0.1794 0.1406 0.1970 0.2396 0.9669 0.1762 0.3593 chrX 143482544 143483544 * Oct4 m0_p0 Oct4-Sox2 143482984 143483000

Write the bed file

In [15]:
!mkdir -p {ddir}/processed/activity/data/
dfi.reset_index()[['example_chrom', 'example_start', 'example_end', 'example_idx']].drop_duplicates().to_csv(f"{ddir}/processed/activity/data/peak.instances.bed", sep='\t', index=False, header=False)
In [16]:
from basepair.cli.schemas import DataSpec

ds = DataSpec.load(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/dataspec.yml")
In [17]:
!head -n 2 {ddir}/processed/activity/data/peak.instances.bed
chrX	143482544	143483544	0
chr3	122145063	122146063	1

Load the bigwigs

In [18]:
assays = ['H3K27ac', 'PolII', 'Groseq']
In [19]:
def tolist(s):
    """Coerce `s` to a list, wrapping a bare string instead of splitting it."""
    return [s] if isinstance(s, str) else list(s)
In [20]:
bigwigs = {a: tolist(df.loc[a].path) for a in assays}
In [21]:
bigwigs = {**bigwigs, **{t: [ds.task_specs[t].pos_counts, ds.task_specs[t].neg_counts] for t in ds.task_specs}}
In [22]:
bigwigs
Out[22]:
{'H3K27ac': ['/srv/scratch/avsec/workspace/chipnexus/data/raw/2018-10-13-histone-chipseq-PMID-28483418/H3K27ac_ChIP-seq_WT_rep1_blacklisted.bw',
  '/srv/scratch/avsec/workspace/chipnexus/data/raw/2018-10-13-histone-chipseq-PMID-28483418/H3K27ac_ChIP-seq_WT_rep2_blacklisted.bw'],
 'PolII': ['/srv/scratch/avsec/workspace/chipnexus/data/raw/2018-10-13-histone-chipseq-PMID-28483418/PolII_ChIP-seq_WT_rep1_blacklisted.bw',
  '/srv/scratch/avsec/workspace/chipnexus/data/raw/2018-10-13-histone-chipseq-PMID-28483418/PolII_ChIP-seq_WT_rep2_blacklisted.bw'],
 'Groseq': ['/srv/scratch/avsec/workspace/chipnexus/data/raw/2018-10-15-groseq-from-melanie/GRO-seq_WT_1_blacklisted.bw'],
 'Oct4': ['/users/avsec/workspace/basepair-workflow/data/Oct4/5prime.counts.pos.bw',
  '/users/avsec/workspace/basepair-workflow/data/Oct4/5prime.counts.neg.bw'],
 'Sox2': ['/users/avsec/workspace/basepair-workflow/data/Sox2/5prime.counts.pos.bw',
  '/users/avsec/workspace/basepair-workflow/data/Sox2/5prime.counts.neg.bw'],
 'Nanog': ['/users/avsec/workspace/basepair-workflow/data/Nanog/5prime.counts.pos.bw',
  '/users/avsec/workspace/basepair-workflow/data/Nanog/5prime.counts.neg.bw'],
 'Klf4': ['/users/avsec/workspace/basepair-workflow/data/Klf4/5prime.counts.pos.bw',
  '/users/avsec/workspace/basepair-workflow/data/Klf4/5prime.counts.neg.bw']}
In [23]:
from basepair.datasets import *
from basepair.config import valid_chr, test_chr
from kipoi.data_utils import numpy_collate
In [24]:
class ActivityDataset(Dataset):
    """Dataset of summed bigwig counts ("activity") per BED interval.

    For each BED interval, every task's interval copy is resized to a
    task-specific width and the coverage of all of that task's bigwig files
    is summed into a single scalar target.

    Args:
        intervals_file: bed4 file containing chrom  start  end  name
        fasta_file: file path; Genome sequence
        bigwigs: dict mapping task name -> list of bigwig file paths;
            counts from all files of one task are added together
        track_width: int or dict {task name -> int}; width (bp) to which
            each interval is resized before extracting counts for that task
        incl_chromosomes: optional; keep only intervals on these chromosomes
        excl_chromosomes: optional; drop intervals on these chromosomes
        num_chr_fasta: if True, the tsv-loader will make sure that the chromosomes
          don't start with chr
    """

    def __init__(self, intervals_file,
                 fasta_file,
                 bigwigs,
                 track_width=2000,
                 incl_chromosomes=None,
                 excl_chromosomes=None,
                 num_chr_fasta=False):
        self.num_chr_fasta = num_chr_fasta
        self.intervals_file = intervals_file
        self.fasta_file = fasta_file
        self.bigwigs = bigwigs
        self.incl_chromosomes = incl_chromosomes
        self.excl_chromosomes = excl_chromosomes
        self.tasks = list(self.bigwigs)
        if isinstance(track_width, dict):
            self.track_width = track_width
        else:
            # track_width always needs to be the dictionary
            self.track_width = {t: track_width for t in self.tasks}

        self.tsv = BedDataset(self.intervals_file,
                              num_chr=self.num_chr_fasta,
                              bed_columns=4,
                              ignore_targets=True,
                              incl_chromosomes=incl_chromosomes,
                              excl_chromosomes=excl_chromosomes,
                             )
        # Extractors are opened lazily in __getitem__ — presumably so the
        # dataset pickles cleanly for parallel data loading (open fasta /
        # bigwig handles are typically not picklable); TODO confirm.
        self.fasta_extractor = None
        self.bigwig_extractors = None

    def __len__(self):
        # number of BED intervals after chromosome filtering
        return len(self.tsv)

    def __getitem__(self, idx):
        # Lazily open the extractors on first access (see note in __init__).
        if self.fasta_extractor is None:
            self.fasta_extractor = FastaExtractor(self.fasta_file)
            self.bigwig_extractors = {a: [BigwigExtractor(f) for f in self.bigwigs[a]]
                                      for a in self.bigwigs}

        interval, labels = self.tsv[idx]
        # interval = resize_interval(interval, self.track_width[t])
        # # Intervals need to be 1000bp wide
        # assert interval.stop - interval.start == 1000

        # Run the fasta extractor
        # seq = np.squeeze(self.fasta_extractor([interval]))

        # Resize a *copy* per task (deepcopy keeps the original interval
        # intact for the `ranges` metadata below).
        resized_intervals = {t: resize_interval(deepcopy(interval), self.track_width[t])
                             for t in self.tasks}

        # Per task: sum the per-base counts across all of the task's bigwig
        # files, then sum over the resized interval to a single scalar.
        targets = {a: sum([e([resized_intervals[a]])[0]
                           for e in self.bigwig_extractors[a]]).sum()
                   for a in self.tasks}

        return {
            # "inputs": {"seq": seq},
            "targets": targets,
            "metadata": {
                "ranges": GenomicRanges(interval.chrom, interval.start, interval.stop, str(idx)),
                "ranges_wide": {t: GenomicRanges.from_interval(interval_wide)
                                for t, interval_wide in resized_intervals.items()},
                "name": interval.name
            }
        }

    def get_targets(self):
        # Delegate to the underlying BedDataset (targets are unused here
        # since ignore_targets=True).
        return self.tsv.get_targets()
In [31]:
track_width = {t: 2000 if t in assays else 1000 for t in bigwigs}
track_width
Out[31]:
{'H3K27ac': 2000,
 'PolII': 2000,
 'Groseq': 2000,
 'Oct4': 1000,
 'Sox2': 1000,
 'Nanog': 1000,
 'Klf4': 1000}
In [ ]:
dl = ActivityDataset(f"{ddir}/processed/activity/data/peak.instances.bed", ds.fasta_file, bigwigs, 
                     track_width=track_width,
                     excl_chromosomes=valid_chr + test_chr)

train = numpy_collate([dl[i] for i in tqdm(range(len(dl)))])

dfy = pd.DataFrame(train['targets'])
 91%|█████████▏| 36631/40136 [11:39<01:02, 56.14it/s]
In [ ]:
dl = ActivityDataset(f"{ddir}/processed/activity/data/peak.instances.bed", ds.fasta_file, bigwigs, 
                        track_width=track_width,
                        incl_chromosomes=valid_chr)

valid = numpy_collate([dl[i] for i in tqdm(range(len(dl)))])
dfy_valid = pd.DataFrame(valid['targets'])

Create features per example

In [ ]:
dfi.head()
In [ ]:
# Columns
# pattern_short, match_weighted_p, imp_weighted_p, pattern_center, 
In [ ]:
columns = ["example_idx", "pattern_name", "example_center", "imp_weighted_p", "match_weighted_p"]

values = ['a', 'b', 'c']

data = {"example_idx": [1, 1, 1],
        "pattern_name": pd.Categorical(['a', 'a', 'b'], values),
        'pattern_center': [10, 20, 50],
        'imp_weighted_p': [.5] * 3,
        'match_weighted_p': [.5] * 3}


x = pd.DataFrame(data)

# Get all features for the strongest motif
# imp_weighted_p and match_weighted_p for the strongest motif


def index_prefix(s, value):
    """Return a copy of `s` whose index labels are prefixed with `value`."""
    out = s.copy()
    out.index = value + out.index.astype(str)
    return out


def index_suffix(s, value):
    """Return a copy of `s` with `value` appended to every index label."""
    out = s.copy()
    out.index = out.index.astype(str) + value
    return out


# --------------------------------------------
# Features
def counts(x):
    """Per-pattern instance counts, labelled '<pattern>/counts'."""
    n_per_pattern = x['pattern_name'].value_counts()
    return index_suffix(n_per_pattern, '/counts')


def present(x):
    """Boolean per pattern (>= 1 instance), labelled '<pattern>/present'."""
    n_per_pattern = x['pattern_name'].value_counts()
    return index_suffix(n_per_pattern > 0, '/present')


def features_max(x, max_feat='imp_weighted_p',
                 output_feat=['imp_weighted_p', 'match_weighted_p', 'pattern_center']):
    """Per pattern, report `output_feat` values of the strongest instance.

    For every `pattern_name` group the row maximizing `max_feat` is picked
    and its `output_feat` columns are flattened into a single Series with
    labels '<pattern>/max/<feature>'.

    NOTE(review): `pattern_name` is expected to be categorical (set in a
    later cell), so groupby can also yield empty groups for absent
    patterns; those become all-NaN entries via the len == 0 branch. The
    inner helper reuses the name `x` for the group, shadowing the outer
    frame.
    """
    def subset_max(x):
        # here `x` is one pattern_name group
        if len(x) == 0:
            # absent pattern -> single all-NaN row (keeps the feature set fixed)
            return pd.DataFrame({x: [np.nan] for x in output_feat})
        else:
            # row of the strongest instance, restricted to the output features
            return x.loc[[x[max_feat].idxmax()]][output_feat]

    # Select the strongest motif instance per pattern
    x_single = x.groupby("pattern_name").apply(subset_max).reset_index()
    del x_single['level_1']
    # reshape to one row with (pattern_name, feature) MultiIndex columns
    x_single = pd.DataFrame(x_single.set_index('pattern_name').stack(dropna=False)).T
    # flatten the column MultiIndex into '<pattern>/max/<feature>' labels
    x_single.columns = ['%s%s' % (a, '/max/%s' % b if b else '')
                        for a, b in x_single.columns]
    return x_single.iloc[0]


# --------------------------------------------
# run for all the features
pd.concat([counts(x), present(x), features_max(x)])


# TODO - make combination of different features
In [ ]:
# make pattern_name categorical
dfi.pattern_name = pd.Categorical(dfi.pattern_name, list(motifs))
dfi.imp_weighted_p = dfi.imp_weighted_p.fillna(0)
dfi.match_weighted_p = dfi.match_weighted_p.fillna(0)
In [ ]:
x
In [ ]:
df = pd.DataFrame(pd.concat([counts(x), present(x), features_max(x)])).T
df.index = [x.index[0]]
In [ ]:
df
In [ ]:
def apply_fn(x):
    """Collapse one example's motif instances into a single feature row.

    Concatenates the counts / presence / strongest-instance features and
    tags the resulting one-row frame with the example's `example_idx`.
    """
    feature_row = pd.DataFrame(pd.concat([counts(x), present(x), features_max(x)])).T
    feature_row['example_idx'] = x['example_idx'].iloc[0]
    return feature_row
In [ ]:
# TODO - put as a jupyter notebook magic?
from tqdm import tqdm as tqdm_cls

# Close progress bars left dangling by interrupted runs so that new
# tqdm bars render cleanly.
stale_bars = tqdm_cls._instances
for _ in range(len(stale_bars)):
    stale_bars.pop().close()
In [ ]:
X_feat = apply_parallel(dfi.reset_index()[['example_idx', 'pattern_name', 
                                          'match_weighted_p', 'imp_weighted_p', 
                                          'pattern_center'] ].groupby('example_idx'), apply_fn, -1)

TODO - save the computed per-example features (X_feat) to a file

Model without graph features

In [53]:
X = X_feat
In [54]:
y_cols = ['H3K27ac', 'PolII', 'Groseq', 'Oct4', 'Sox2', 'Nanog', 'Klf4']
x_cols = [c for c in X.columns if c != 'example_idx']
In [55]:
from sklearn.preprocessing import StandardScaler
In [56]:
y_train = pd.DataFrame(train['targets'])
y_train['example_idx'] = train['metadata']['name'].astype(int)
Xy_train = pd.merge(y_train, X, on='example_idx')

y = Xy_train[y_cols]
yscaler = StandardScaler()
y = yscaler.fit_transform(np.log10(y + 1).values, y=None)
x = Xy_train[x_cols]
x = x.fillna(0)
In [57]:
y_valid = pd.DataFrame(valid['targets'])
y_valid['example_idx'] = valid['metadata']['name'].astype(int)
Xy_valid = pd.merge(y_valid, X, on='example_idx')

y_valid = Xy_valid[y_cols]
# Apply the scaler fitted on the *training* targets. The previous
# `fit_transform` re-fitted the scaler on the validation set, leaking
# validation statistics into the evaluation.
y_valid = yscaler.transform(np.log10(y_valid + 1).values)
x_valid = Xy_valid[x_cols]
x_valid = x_valid.fillna(0)

# FIXME: the following call was left unfinished (truncated dict literal,
# a SyntaxError); commented out until the intended payload is written.
# HDF5BatchWriter.dump(output_dir, {"inputs"

Linear model

In [52]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict
from basepair.plot.evaluate import regression_eval
In [60]:
from basepair.plot.config import paper_config
paper_config()

Linear regression

In [61]:
# Fit and evaluate one LinearRegression model per target task.
for i, task in enumerate(y_cols):
    fig, ax = plt.subplots(figsize=(3, 3))
    print(i, task)
    m = LinearRegression()
    m.fit(x.values, y[:, i])
    # predict from the raw array — the model was fitted on an unnamed
    # ndarray, so passing the DataFrame triggers sklearn's feature-name warning
    preds = m.predict(x_valid.values)
    regression_eval(y_valid[:, i], preds, task=task, alpha=0.5, ax=ax);
0 H3K27ac
1 PolII
2 Groseq
3 Oct4
4 Sox2
5 Nanog
6 Klf4

Random forest

Old results for random forest without graph features

In [310]:
# Fit and evaluate one RandomForestRegressor per target task.
for i, task in enumerate(y_cols):
    fig, ax = plt.subplots(figsize=(3, 3))
    print(i, task)
    m = RandomForestRegressor(n_estimators=100, n_jobs=5)
    m.fit(x.values, y[:, i])
    # predict from the raw array — the model was fitted on an unnamed
    # ndarray, so passing the DataFrame triggers sklearn's feature-name warning
    preds = m.predict(x_valid.values)
    regression_eval(y_valid[:, i], preds, task=task, alpha=0.5, ax=ax);
0 H3K27ac
1 PolII
2 Groseq
3 Oct4
4 Sox2
5 Nanog
6 Klf4

Add gSpan features

In [65]:
from basepair.modisco.gspan import load_gspan_as_features

gspan_dir = modisco_dir / 'interpretable-model/gSpan'
graph_file = gspan_dir / 'match!=low,imp=high,dist=10-50.data'
gspan_output = gspan_dir / "gspan.output.txt"
In [77]:
dfg = load_gspan_as_features(gspan_output, str(graph_file) + ".idx")
In [78]:
dfg.head()
Out[78]:
Oct4-Sox2,Oct4-Sox2;0-1 Oct4-Sox2,Oct4-Sox2,Oct4-Sox2-deg;0-1,1-2 Oct4-Sox2,Oct4-Sox2,Oct4-Sox2-deg,Oct4-Sox2-deg;0-1,1-2,2-3 Oct4-Sox2,Oct4-Sox2,Oct4-Sox2-deg,Oct4-Sox2-deg;0-1,0-3,1-2,2-3 Oct4-Sox2,Oct4-Sox2,Oct4-Sox2-deg,Oct4-Sox2-deg;0-1,0-3,1-2 Nanog,Oct4-Sox2;0-1 Nanog,Oct4-Sox2,Oct4-Sox2-deg;0-1,0-2 Nanog,Oct4-Sox2,Oct4-Sox2-deg,Nanog-periodic;0-1,0-2,2-3 Nanog,Oct4-Sox2,Oct4-Sox2-deg,Sox2;0-1,0-2,0-3 Nanog,Oct4-Sox2,Nanog-periodic;0-1,1-2 Nanog,Oct4-Sox2,Nanog-periodic,Oct4-Sox2-deg;0-1,1-2,2-3 Nanog,Oct4-Sox2,Nanog-periodic,Oct4-Sox2-deg;0-1,0-3,1-2,2-3 Nanog,Oct4-Sox2,Nanog-periodic,Oct4-Sox2-deg;0-1,0-3,1-2 Nanog,Oct4-Sox2,Sox2;0-1,0-2 Nanog,Oct4-Sox2-deg;0-1 Nanog,Oct4-Sox2-deg,Nanog-periodic;0-1,1-2 Nanog,Oct4-Sox2-deg,Nanog-periodic,Oct4-Sox2;0-1,1-2,2-3 Nanog,Oct4-Sox2-deg,Oct4-Sox2-deg;0-1,0-2 Nanog,Oct4-Sox2-deg,Sox2;0-1,1-2 Nanog,Oct4-Sox2-deg,Sox2;0-1,0-2 Oct4-Sox2,Oct4-Sox2-deg;0-1 Oct4-Sox2,Oct4-Sox2-deg,Oct4-Sox2-deg;0-1,1-2 Oct4-Sox2,Oct4-Sox2-deg,Oct4-Sox2-deg,Oct4-Sox2;0-1,1-2,2-3 Oct4-Sox2,Oct4-Sox2-deg,Sox2;0-1,1-2 Oct4-Sox2,Oct4-Sox2-deg,Sox2;0-1,0-2 Oct4-Sox2,Oct4-Sox2-deg,Sox2,Oct4-Sox2-deg;0-1,0-2,2-3 Oct4-Sox2-deg,Oct4-Sox2-deg;0-1 Oct4-Sox2-deg,Oct4-Sox2-deg,Sox2;0-1,1-2 Oct4-Sox2,Sox2;0-1 Oct4-Sox2,Sox2,Oct4-Sox2-deg;0-1,1-2 Oct4-Sox2-deg,Sox2;0-1 Oct4-Sox2-deg,Sox2,Sox2;0-1,1-2 Nanog-periodic,Oct4-Sox2;0-1 Nanog-periodic,Oct4-Sox2,Oct4-Sox2-deg;0-1,0-2 Nanog-periodic,Oct4-Sox2-deg;0-1 Nanog-periodic,Oct4-Sox2-deg,Sox2;0-1,0-2 Oct4,Oct4-Sox2;0-1 Oct4,Oct4-Sox2,Oct4-Sox2-deg;0-1,1-2 Oct4,Oct4-Sox2,Oct4-Sox2-deg;0-1,0-2,1-2 Oct4,Oct4-Sox2,Oct4-Sox2-deg,Oct4-Sox2-deg;0-1,0-2,1-2,2-3 Oct4,Oct4-Sox2,Oct4-Sox2-deg,Oct4-Sox2-deg;0-1,1-2,2-3 Oct4,Oct4-Sox2,Oct4-Sox2-deg;0-1,0-2 Oct4,Oct4-Sox2,Oct4-Sox2-deg,Oct4-Sox2-deg;0-1,0-2,2-3 Oct4,Oct4-Sox2,Sox2;0-1,1-2 Oct4,Oct4-Sox2,Sox2;0-1,0-2,1-2 Oct4,Oct4-Sox2,Sox2,Oct4-Sox2-deg;0-1,0-2,1-2,2-3 Oct4,Oct4-Sox2,Sox2,Oct4-Sox2-deg;0-1,1-2,2-3 Oct4,Oct4-Sox2,Sox2;0-1,0-2 
Oct4,Oct4-Sox2,Sox2,Oct4-Sox2-deg;0-1,0-2,2-3 Oct4,Oct4-Sox2-deg;0-1 Oct4,Oct4-Sox2-deg,Oct4-Sox2;0-1,1-2 Oct4,Oct4-Sox2-deg,Oct4-Sox2,Oct4-Sox2-deg;0-1,1-2,1-3 Oct4,Oct4-Sox2-deg,Oct4-Sox2-deg;0-1,1-2 Essrb,Oct4-Sox2;0-1 Essrb,Oct4-Sox2,Oct4-Sox2-deg;0-1,0-2 Essrb,Nanog;0-1 Essrb,Oct4-Sox2-deg;0-1 Nanog,Nanog-periodic;0-1 Nanog,Nanog-periodic,Nanog;0-1,1-2 Nanog,Nanog-periodic,Nanog-periodic;0-1,1-2 Nanog,Nanog-periodic,Nanog-periodic,Nanog;0-1,1-2,2-3 Nanog,Nanog-periodic,Nanog-periodic;0-1,0-2 Nanog,Nanog-periodic,Sox2;0-1,1-2 Nanog,Nanog-periodic,Sox2;0-1,0-2,1-2 Nanog,Nanog-periodic,Sox2,Nanog;0-1,1-2,2-3 Nanog,Nanog-periodic,Sox2;0-1,0-2 Klf4,Oct4-Sox2;0-1 Klf4,Oct4-Sox2,Oct4;0-1,1-2 Klf4,Oct4-Sox2,Oct4-Sox2-deg;0-1,0-2 Klf4,Oct4;0-1 Klf4,Oct4-Sox2-deg;0-1 Sox2,Sox2;0-1 Nanog,Sox2;0-1 Nanog,Sox2,Nanog-periodic;0-1,1-2 Nanog,Sox2,Nanog;0-1,1-2 Klf4,Nanog;0-1 Oct4,Sox2;0-1 Oct4,Sox2,Oct4-Sox2;0-1,1-2 Oct4,Sox2,Oct4-Sox2,Oct4-Sox2-deg;0-1,1-2,1-3 Oct4,Sox2,Oct4-Sox2-deg;0-1,1-2 Nanog,Nanog;0-1 Nanog,Nanog,Nanog-periodic;0-1,1-2 Nanog,Nanog,Nanog-periodic;0-1,0-2,1-2 Nanog,Nanog,Nanog-periodic,Nanog-periodic;0-1,1-2,2-3 Nanog,Nanog,Nanog-periodic,Nanog-periodic;0-1,0-3,1-2,2-3 Nanog,Nanog,Nanog-periodic,Nanog-periodic;0-1,0-3,1-2 Nanog,Nanog,Nanog-periodic,Nanog-periodic;0-1,1-2,1-3 Nanog,Nanog,Nanog-periodic,Sox2;0-1,0-3,1-2 Nanog,Nanog,Oct4-Sox2-deg;0-1,1-2 Nanog,Nanog,Sox2;0-1,1-2 Nanog,Nanog,Sox2,Nanog-periodic;0-1,1-2,2-3 Nanog-periodic,Nanog-periodic;0-1 Klf4,Sox2;0-1 Nanog-periodic,Sox2;0-1 Nanog,Oct4;0-1 Klf4,Klf4;0-1 Essrb,Nanog-periodic;0-1 Essrb,Sox2;0-1 Essrb,Essrb;0-1 Essrb,Klf4;0-1
example_idx
1 True False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False
2 False False False False False True True False False False False False False False True False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False
3 False False False False False False False False False False False False False False False False False False False False True True False False False False True False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False
4 True True False False False False False False False False False False False False False False False False False False True False False True False False False False True True True False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False
5 False False False False False False False False False False False False False False False False False False False False False False False False False False True False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False
In [103]:
# Append to X
X = pd.merge(X_feat, dfg.reset_index(), on='example_idx', how='left')
X[dfg.columns] = X[dfg.columns].fillna(False)

Merge features with bigwig counts

Model with graph features

In [48]:
y_cols = ['H3K27ac', 'PolII', 'Groseq', 'Oct4', 'Sox2', 'Nanog', 'Klf4']
x_cols = [c for c in X.columns if c != 'example_idx']
In [49]:
from sklearn.preprocessing import StandardScaler
In [50]:
y_train = pd.DataFrame(train['targets'])
y_train['example_idx'] = train['metadata']['name'].astype(int)
Xy_train = pd.merge(y_train, X, on='example_idx')

y = Xy_train[y_cols]
yscaler = StandardScaler()
y = yscaler.fit_transform(np.log10(y + 1).values, y=None)
x = Xy_train[x_cols]
x = x.fillna(0)
In [51]:
y_valid = pd.DataFrame(valid['targets'])
y_valid['example_idx'] = valid['metadata']['name'].astype(int)
Xy_valid = pd.merge(y_valid, X, on='example_idx')

y_valid = Xy_valid[y_cols]
# Apply the scaler fitted on the *training* targets. The previous
# `fit_transform` re-fitted the scaler on the validation set, leaking
# validation statistics into the evaluation.
y_valid = yscaler.transform(np.log10(y_valid + 1).values)
x_valid = Xy_valid[x_cols]
x_valid = x_valid.fillna(0)

Train the model

In [124]:
x.shape
Out[124]:
(40136, 140)

Linear regression

In [125]:
# Fit and evaluate one LinearRegression model per target task
# (feature matrix now includes the gSpan graph features).
for i, task in enumerate(y_cols):
    fig, ax = plt.subplots(figsize=(3, 3))
    print(i, task)
    m = LinearRegression()
    m.fit(x.values, y[:, i])
    # predict from the raw array — the model was fitted on an unnamed
    # ndarray, so passing the DataFrame triggers sklearn's feature-name warning
    preds = m.predict(x_valid.values)
    regression_eval(y_valid[:, i], preds, task=task, alpha=0.5, ax=ax);
0 H3K27ac
1 PolII
2 Groseq
3 Oct4
4 Sox2
5 Nanog
6 Klf4

Random forest

In [123]:
# Fit and evaluate one RandomForestRegressor per target task
# (feature matrix now includes the gSpan graph features).
for i, task in enumerate(y_cols):
    fig, ax = plt.subplots(figsize=(3, 3))
    print(i, task)
    m = RandomForestRegressor(n_estimators=100, n_jobs=5)
    m.fit(x.values, y[:, i])
    # predict from the raw array — the model was fitted on an unnamed
    # ndarray, so passing the DataFrame triggers sklearn's feature-name warning
    preds = m.predict(x_valid.values)
    regression_eval(y_valid[:, i], preds, task=task, alpha=0.5, ax=ax);
0 H3K27ac
1 PolII
2 Groseq
3 Oct4
4 Sox2
5 Nanog
6 Klf4

Write the co-occurring features to the HDF5 file

In [ ]: