Goal

  • train the directness score

Conclusions

  • prediction accuracy ~77% for non-TE patterns (~58% for TE patterns)
  • the discovered predictive features make sense
In [196]:
from uuid import uuid4
from basepair.math import mean
from basepair.stats import perc
from IPython.display import display, HTML
from basepair.plot.vdom import df2html, df2html_old, render_datatable
from basepair.modisco.core import patterns_to_df
from basepair.modisco.utils import longer_pattern, shorten_pattern
from basepair.modisco.motif_clustering import to_colors, preproc_motif_table, motif_table_long, scale, preproc_df, pd_minmax_scale, pd_scale
# NOTE(review): the star import supplies Path, pd, plotnine/ggplot, read_pkl,
# ModiscoResult and the `ddir` base directory used below — explicit imports
# would make the provenance of these names clearer.
from basepair.imports import *

# Locations of the trained model and its TF-MoDISco results
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/profile/"
In [197]:
# Open the TF-MoDISco result file and pull out all discovered patterns.
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()

patterns = list(map(mr.get_pattern, mr.patterns()))

# Task names are the prefix before '/' (e.g. 'Oct4/profile' -> 'Oct4')
tasks = [task_spec.split("/")[0] for task_spec in mr.tasks()]
In [198]:
# read the pattern property table
# NOTE(review): hardcoded absolute path duplicates model_dir under a different
# root — consider deriving it from a single configurable base directory.
output_dir = Path("/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile")
pattern_table = preproc_df(pd.read_csv(output_dir / "pattern_table.csv"))
# read the per-pattern footprints
footprints = read_pkl(output_dir / 'footprints.pkl')

Append TE vs non-TE

In [199]:
# Load the annotated (TE / non-TE) pattern tables.
# NOTE(review): the second preproc_df argument (100 / None) presumably limits
# or thresholds the table — confirm against preproc_df's signature.
nte_contrib_raw = pd.read_csv(output_dir / 'motif_clustering/nte_contrib.csv')
pattern_table_nte_contrib = preproc_df(nte_contrib_raw, 100)

nte_seq_raw = pd.read_csv(output_dir / 'motif_clustering/nte_seq.csv')
pattern_table_nte_seq = preproc_df(nte_seq_raw, 100)

te_contrib_raw = pd.read_csv(output_dir / 'motif_clustering/te_contrib.csv')
pattern_table_te_contrib = preproc_df(te_contrib_raw, None)
In [200]:
# Pattern identifiers present in the non-TE and TE contribution tables
pn_nte = pattern_table_nte_contrib['pattern'].values
pn_te = pattern_table_te_contrib['pattern'].values
In [201]:
pattern_table['is_te'] = pattern_table.pattern.isin(pn_te)

Merge with the annotated csv

In [202]:
# Load the directness annotations (filename suggests a manual annotation by
# Ziga from 2018-10-16); yields one '<TF> directness' column per TF.
from basepair.exp.chipnexus.annotate_profile import read_annotated_csv
dfl = read_annotated_csv("../2018-10-16-ziga-all-profile.csv", suffix=' directness')
In [203]:
dfl.head()
Out[203]:
Klf4 directness Nanog directness Oct4 directness Sox2 directness pattern
0 ambiguous not-bound not-bound not-bound m9_p9
1 ambiguous not-bound not-bound not-bound m9_p8
2 not-bound not-bound ambigous not-bound m9_p7
3 not-bound not-bound not-bound not-bound m9_p6
4 not-bound direct not-bound not-bound m9_p5
In [204]:
dft = pd.merge(pattern_table, dfl, on='pattern', how='left')
In [205]:
dft.head()
Out[205]:
idx pattern n seqlets ic pwm mean Klf4 imp profile Klf4 imp counts Klf4 footprint entropydiff Klf4 footprint max Klf4 footprint strandcor Klf4 footprint counts ... Klf4 pos unimodal metacluster log n seqlets motif len motif ic is_te Klf4 directness Nanog directness Oct4 directness Sox2 directness
0 0 m0_p0 9373 0.832360 0.019664 0.016579 0.038678 0.516096 0.005113 114.674965 ... NaN 0 3.971879 16 13.317753 False not-bound indirect direct direct
1 1 m0_p1 3101 1.173885 0.021741 0.021541 0.066953 0.886055 0.005224 185.448349 ... NaN 0 3.491502 9 10.564969 False indirect indirect ambigous direct
2 2 m0_p2 1656 0.739850 0.017614 0.013870 0.077125 0.854888 0.004884 189.111115 ... NaN 0 3.219060 16 11.837593 False indirect indirect indirect ambigous
3 3 m0_p3 1024 0.748689 0.012395 0.010862 0.068053 0.890029 0.005256 169.129028 ... NaN 0 3.010300 10 7.486892 False not-bound direct not-bound not-bound
4 4 m0_p4 991 0.858039 0.008561 0.008131 0.065403 0.430162 0.005060 97.099190 ... NaN 0 2.996074 20 17.160788 False not-bound indirect direct direct

5 rows × 58 columns

Convert to the long format

In [206]:
dftl = motif_table_long(dft, tasks, index_cols=['is_te'])
In [207]:
assert dft.shape[0]*4 == dftl.shape[0]
In [208]:
dft.shape
Out[208]:
(114, 58)
In [209]:
dftl.shape
Out[209]:
(456, 22)
In [210]:
# all feature columns available in the long table
list(dftl)
Out[210]:
['idx',
 'consensus',
 'n seqlets',
 'metacluster',
 'pattern',
 'motif len',
 'ic pwm mean',
 'is_te',
 'directness',
 'footprint counts',
 'footprint entropydiff',
 'footprint max',
 'footprint strandcor',
 'imp counts',
 'imp profile',
 'periodicity 10bp',
 'pos meandiff',
 'pos std',
 'pos unimodal',
 'region counts',
 'task',
 'log n seqlets']
In [211]:
# features of interest for the boxplots and linear models
# (raw seqlet counts and the position-based features are deliberately excluded)
boxplot_features = [
#  'n seqlets',
 'motif len',
 'ic pwm mean',
 'footprint counts',
 'footprint entropydiff',
 'footprint max',
 'footprint strandcor',
 'imp counts',
 'imp profile',
 'periodicity 10bp',
 #'pos meandiff',
 #'pos std',
 #'pos unimodal',
 'region counts',
 'log n seqlets'
]
In [212]:
# Duplicate 'task' as 'TF' (used as the x-axis in the plots below)
dftl['TF'] = dftl['task']
# Fix the 'ambigous' typo in the annotations. Use .loc instead of the
# original chained indexing (dftl['directness'][mask] = ...), which raises
# SettingWithCopyWarning and can silently fail to write through a copy.
dftl.loc[dftl.directness == 'ambigous', 'directness'] = 'ambiguous'
# Melt to one row per (pattern, task, feature) for the faceted boxplots
dftl_melt = pd.melt(dftl, id_vars=['directness', 'task', 'TF', 'pattern', 'is_te'], value_vars=boxplot_features)
dftl_melt = dftl_melt.dropna()
In [214]:
dftl.groupby('is_te').directness.value_counts()
Out[214]:
is_te  directness
False  not-bound     179
       indirect       56
       direct         48
       ambiguous      41
True   not-bound      49
       direct         31
       indirect       27
       ambiguous      21
Name: directness, dtype: int64
In [215]:
dftl_melt.head()
Out[215]:
directness task TF pattern is_te variable value
0 not-bound Klf4 Klf4 m0_p0 False motif len 16.0
1 indirect Klf4 Klf4 m0_p1 False motif len 9.0
2 indirect Klf4 Klf4 m0_p2 False motif len 16.0
3 not-bound Klf4 Klf4 m0_p3 False motif len 10.0
4 not-bound Klf4 Klf4 m0_p4 False motif len 20.0
In [216]:
dftl_melt['norm_value'] = dftl_melt.groupby(['variable', 'TF', 'is_te']).value.apply(pd_scale)

non-TE

In [25]:
# Faceted boxplots of the scaled features by TF and directness, non-TE patterns only
plotnine.options.figure_size = (15, 7)
ggplot(aes(x='TF', y='norm_value', color='directness'), dftl_melt[~dftl_melt.is_te]) + \
  geom_boxplot() + facet_wrap("~variable", ncol=6) + \
  theme_classic() + \
  theme(axis_text_x=element_text(rotation=90, hjust=1), subplots_adjust={'wspace':0.15}) + \
  ggtitle("non-TE")
Out[25]:
<ggplot: (8789577691954)>

TE

In [26]:
# Same faceted boxplots for the TE patterns
plotnine.options.figure_size = (15, 7)
ggplot(aes(x='TF', y='norm_value', color='directness'), dftl_melt[dftl_melt.is_te]) + \
  geom_boxplot() + facet_wrap("~variable", ncol=6) + \
  theme_classic() + \
  theme(axis_text_x=element_text(rotation=90, hjust=1), subplots_adjust={'wspace':0.15}) + \
  ggtitle("TE")
Out[26]:
<ggplot: (8789340997454)>

Build a simple linear model

Make a one-vs-rest classifier

not-bound    2508
indirect      913
direct        869
ambiguous     682
In [185]:
from basepair.stats import ols_formula, tidy_ols
import statsmodels.api as sm
import statsmodels.formula.api as smf

def fit_multiclass_lm(dftl, multiclass_reponse, num_features, cat_features=None):
    """Fit one OLS model per class (one-vs-rest) and return tidied coefficients.

    Args:
        dftl: long-format motif table, one row per (pattern, task).
        multiclass_reponse: name of the column holding the multi-class label.
            (NOTE: parameter keeps the original 'reponse' spelling so existing
            keyword callers keep working.)
        num_features: numeric feature columns; z-scaled via `scale` before fitting.
        cat_features: categorical feature columns passed through unchanged
            (the patsy formula dummy-codes them). Defaults to no categorical
            features; using None avoids a mutable default argument.

    Returns:
        pd.DataFrame: concatenated tidied OLS coefficient tables with a
        'response' column identifying the one-vs-rest class of each row.
    """
    if cat_features is None:
        cat_features = []
    # One 0/1 indicator column per class; strip '-' so the class names are
    # valid formula terms (e.g. 'not-bound' -> 'notbound').
    y = pd.get_dummies(dftl[multiclass_reponse])
    y.columns = y.columns.str.replace("-", '')

    # Scaling does not depend on the response class — hoist it out of the loop
    # (the original recomputed it on every iteration).
    x = scale(dftl[num_features])

    out = []
    for response in y.columns:
        model_df = pd.concat([x, dftl[cat_features], y[[response]]], axis=1)
        # Formula terms cannot contain spaces.
        model_df.columns = model_df.columns.str.replace(" ", "_")
        model_df = model_df.dropna()
        results = smf.ols(ols_formula(model_df, response), data=model_df).fit()

        coef_df = tidy_ols(results).reset_index()
        coef_df['response'] = response
        out.append(coef_df)
    return pd.concat(out)

Non-TE

In [181]:
# Fit the one-vs-rest OLS models on the non-TE patterns (ambiguous rows dropped)
nte_mask = (~dftl.is_te) & (dftl.directness != 'ambiguous')
df_fit = fit_multiclass_lm(dftl[nte_mask], 'directness', boxplot_features, ['TF'])

# Keep only significant, non-intercept coefficients
signif = df_fit['P>|t|'].astype(float) < 0.05
not_intercept = df_fit[""] != "Intercept"
df_fit_signif = df_fit[signif & not_intercept]
# Sort by absolute effect size, largest first
effect_order = df_fit_signif['coef'].astype(float).abs().argsort()
df_fit_signif = df_fit_signif.iloc[effect_order[::-1]]
In [182]:
df_fit_signif[df_fit_signif.response == 'direct']
Out[182]:
index coef std err t P>|t| [0.025 0.975] response
8 8 footprint_max 0.3949 0.051 7.707 0.000 0.294 0.496 direct
11 11 imp_profile 0.2224 0.048 4.645 0.000 0.128 0.317 direct
13 13 region_counts -0.1916 0.066 -2.885 0.004 -0.322 -0.061 direct
7 7 footprint_entropydiff -0.1882 0.037 -5.062 0.000 -0.261 -0.115 direct
10 10 imp_counts -0.1300 0.042 -3.126 0.002 -0.212 -0.048 direct
9 9 footprint_strandcor 0.1105 0.036 3.056 0.002 0.039 0.182 direct
6 6 footprint_counts -0.1076 0.040 -2.656 0.008 -0.187 -0.028 direct
5 5 ic_pwm_mean 0.0597 0.029 2.053 0.041 0.002 0.117 direct
In [183]:
df_fit_signif[df_fit_signif.response == 'indirect']
Out[183]:
index coef std err t P>|t| [0.025 0.975] response
1 1 TF[T.Nanog] 0.2490 0.092 2.701 0.007 0.067 0.431 indirect
11 11 imp_profile -0.1949 0.057 -3.434 0.001 -0.307 -0.083 indirect
10 10 imp_counts 0.1032 0.049 2.095 0.037 0.006 0.200 indirect
5 5 ic_pwm_mean 0.0963 0.034 2.793 0.006 0.028 0.164 indirect
14 14 log_n_seqlets 0.0928 0.025 3.736 0.000 0.044 0.142 indirect
In [184]:
df_fit_signif[df_fit_signif.response == 'notbound']
Out[184]:
index coef std err t P>|t| [0.025 0.975] response
8 8 footprint_max -0.4401 0.059 -7.418 0.000 -0.557 -0.323 notbound
1 1 TF[T.Nanog] -0.2938 0.090 -3.262 0.001 -0.471 -0.117 notbound
14 14 log_n_seqlets -0.1482 0.024 -6.111 0.000 -0.196 -0.100 notbound
5 5 ic_pwm_mean -0.1322 0.034 -3.923 0.000 -0.198 -0.066 notbound

TE

In [177]:
# Same analysis for the TE patterns (duplicated from the non-TE cell above —
# a shared helper function would avoid the copy-paste)
df_fit = fit_multiclass_lm(dftl[(dftl.is_te) & (dftl.directness != 'ambiguous')], 
                           'directness', boxplot_features, ['TF'])

df_fit_signif = df_fit[df_fit['P>|t|'].astype(float) < 0.05]
df_fit_signif = df_fit_signif[df_fit_signif[""] != "Intercept"]  # don't show the intercept
# keep only the significant coefficients
df_fit_signif = df_fit_signif.iloc[df_fit_signif['coef'].astype(float).abs().argsort()].iloc[::-1]  # sort by the effect size
In [178]:
df_fit_signif[df_fit_signif.response == 'direct']
Out[178]:
index coef std err t P>|t| [0.025 0.975] response
9 9 footprint_strandcor 0.2033 0.081 2.516 0.014 0.043 0.364 direct
In [179]:
df_fit_signif[df_fit_signif.response == 'indirect']
Out[179]:
index coef std err t P>|t| [0.025 0.975] response
4 4 motif_len -0.1031 0.050 -2.056 0.043 -0.203 -0.004 indirect
In [180]:
df_fit_signif[df_fit_signif.response == 'notbound']
Out[180]:
index coef std err t P>|t| [0.025 0.975] response
4 4 motif_len 0.0972 0.041 2.368 0.020 0.016 0.179 notbound

Plot the accuracy of prediction

In [49]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsOneClassifier
from sklearn.preprocessing import minmax_scale, StandardScaler
from sklearn.model_selection import RepeatedKFold, cross_val_predict
from basepair.metrics import MetricsDict
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from scikitplot.metrics import plot_confusion_matrix
In [51]:
def get_Xy(dftl, multiclass_reponse, num_features, cat_features=[]):
    """Get the feature vector
    """
    y = pd.get_dummies(dftl[multiclass_reponse])
    y_idx = y.values.argmax(1)
    #y.columns = y.columns.str.replace("-", '')
    #y = dftl[multiclass_reponse]    
    X = pd.concat([dftl[num_features], pd.get_dummies(dftl[cat_features], drop_first=True)], axis=1)
    return X.values, y_idx, list(X), list(y)

Non-TE

In [146]:
# Features/labels for the non-TE patterns (ambiguous rows excluded)
x,y, x_feat, y_feat = get_Xy(dftl[(~dftl.is_te) & (dftl.directness != 'ambiguous')], 
                             'directness', boxplot_features, ['TF'])
In [147]:
model = make_pipeline(StandardScaler(), OneVsOneClassifier(LogisticRegression(C=1e6))) # logreg without regularization
In [148]:
y_pred = cross_val_predict(model, x, y, method='predict', cv=5)
In [149]:
accuracy_score(y, y_pred)
Out[149]:
0.7735191637630662
In [150]:
# Map the integer class indices back to label strings for the confusion matrix
idx_to_label = dict(enumerate(y_feat))
y_true_f = pd.Series(y).map(idx_to_label).values
y_pred_f = pd.Series(y_pred).map(idx_to_label).values
In [151]:
plot_confusion_matrix(y_true_f, y_pred_f);

TE

In [152]:
# Features/labels for the TE patterns (ambiguous rows excluded)
x,y, x_feat, y_feat = get_Xy(dftl[(dftl.is_te) & (dftl.directness != 'ambiguous')], 
                             'directness', boxplot_features, ['TF'])
In [153]:
model = make_pipeline(StandardScaler(), OneVsOneClassifier(LogisticRegression(C=1e6))) # logreg without regularization
In [154]:
y_pred = cross_val_predict(model, x, y, method='predict', cv=5)
In [155]:
accuracy_score(y, y_pred)
Out[155]:
0.5794392523364486
In [156]:
# Map the integer class indices back to label strings (same as the non-TE section)
y_true_f = pd.Series(y).map({i: v for i,v in enumerate(y_feat)}).values
y_pred_f = pd.Series(y_pred).map({i: v for i,v in enumerate(y_feat)}).values
In [157]:
plot_confusion_matrix(y_true_f, y_pred_f);