from uuid import uuid4
from basepair.math import mean
from basepair.stats import perc
from IPython.display import display, HTML
from basepair.plot.vdom import df2html, df2html_old, render_datatable
from basepair.modisco.core import patterns_to_df
from basepair.modisco.utils import longer_pattern, shorten_pattern
from basepair.modisco.motif_clustering import to_colors, preproc_motif_table, motif_table_long, scale, preproc_df, pd_minmax_scale, pd_scale
from basepair.imports import *
# --- Load the TF-MoDISco result and pattern property tables ---
# Model directory for the oct-sox-nanog-klf ChIP-nexus model (n_dil_layers=9).
# `ddir`, `Path`, `pd`, `ModiscoResult`, `read_pkl` are expected to come from
# `basepair.imports` (star import above) -- TODO confirm.
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/profile/"
# Open the modisco HDF5 file and materialize all discovered patterns.
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()
patterns = [mr.get_pattern(p) for p in mr.patterns()]
# Task ids look like "<task>/<...>"; keep only the part before the first "/".
tasks = [x.split("/")[0] for x in mr.tasks()]
# read the property table
output_dir = Path("/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile")
pattern_table = preproc_df(pd.read_csv(output_dir / "pattern_table.csv"))
# read the footprints
footprints = read_pkl(output_dir / 'footprints.pkl')
# load annotated tables (nte = non-TE, te = transposable element).
# NOTE(review): the second preproc_df argument (100 / None) is presumably a
# row limit or threshold -- confirm against basepair.modisco.motif_clustering.
pattern_table_nte_contrib = preproc_df(pd.read_csv(output_dir / 'motif_clustering/nte_contrib.csv'), 100)
pattern_table_nte_seq = preproc_df(pd.read_csv(output_dir / 'motif_clustering/nte_seq.csv'), 100)
pattern_table_te_contrib = preproc_df(pd.read_csv(output_dir / 'motif_clustering/te_contrib.csv'), None)
# Pattern names present in the non-TE / TE contribution tables.
pn_nte = pattern_table_nte_contrib.pattern.values
pn_te = pattern_table_te_contrib.pattern.values
# Flag each pattern as TE if it appears in the TE contribution table.
pattern_table['is_te'] = pattern_table.pattern.isin(pn_te)
# --- Merge in the manually annotated directness labels ---
from basepair.exp.chipnexus.annotate_profile import read_annotated_csv
# Annotation columns get the ' directness' suffix appended.
dfl = read_annotated_csv("../2018-10-16-ziga-all-profile.csv", suffix=' directness')
dfl.head()
# Left-join annotations onto the pattern table by pattern name; patterns
# without an annotation get NaN labels.
dft = pd.merge(pattern_table, dfl, on='pattern', how='left')
dft.head()
# Wide -> long: one row per (pattern, task); `is_te` is carried along.
dftl = motif_table_long(dft, tasks, index_cols=['is_te'])
# Sanity check: 4 tasks -> exactly 4 long rows per wide row.
assert dft.shape[0]*4 == dftl.shape[0]
dft.shape
dftl.shape
# all features in the table
list(dftl)
# features of interest for the per-TF boxplots below; the commented-out
# entries were deliberately excluded from the analysis.
boxplot_features = [
# 'n seqlets',
'motif len',
'ic pwm mean',
'footprint counts',
'footprint entropydiff',
'footprint max',
'footprint strandcor',
'imp counts',
'imp profile',
'periodicity 10bp',
#'pos meandiff',
#'pos std',
#'pos unimodal',
'region counts',
'log n seqlets'
]
# Alias the task column as 'TF' for nicer axis labels in the plots below.
dftl['TF'] = dftl['task']
# Fix the 'ambigous' typo in the annotation labels.
# Use .loc instead of the original chained indexing
# (dftl['directness'][mask] = ...), which can silently assign into a
# temporary copy (SettingWithCopyWarning) and leave `dftl` unchanged.
dftl.loc[dftl.directness == 'ambigous', 'directness'] = 'ambiguous'
# Melt to one row per (pattern, task, feature) for the faceted boxplots.
dftl_melt = pd.melt(dftl, id_vars=['directness', 'task', 'TF', 'pattern', 'is_te'], value_vars=boxplot_features)
dftl_melt = dftl_melt.dropna()
# Notebook-style inspection: label counts per TE status.
dftl.groupby('is_te').directness.value_counts()
dftl_melt.head()
# Standardize each feature within (feature, TF, TE-status) groups so values
# are comparable across features in the shared-axis boxplots.
dftl_melt['norm_value'] = dftl_melt.groupby(['variable', 'TF', 'is_te']).value.apply(pd_scale)
# Faceted boxplots of normalized feature values per TF, colored by the
# directness label -- non-TE motifs only. Bare expression so the notebook
# renders the plot.
plotnine.options.figure_size = (15, 7)
ggplot(aes(x='TF', y='norm_value', color='directness'), dftl_melt[~dftl_melt.is_te]) + \
geom_boxplot() + facet_wrap("~variable", ncol=6) + \
theme_classic() + \
theme(axis_text_x=element_text(rotation=90, hjust=1), subplots_adjust={'wspace':0.15}) + \
ggtitle("non-TE")
# Same faceted boxplots as above, but for the TE motifs.
plotnine.options.figure_size = (15, 7)
ggplot(aes(x='TF', y='norm_value', color='directness'), dftl_melt[dftl_melt.is_te]) + \
geom_boxplot() + facet_wrap("~variable", ncol=6) + \
theme_classic() + \
theme(axis_text_x=element_text(rotation=90, hjust=1), subplots_adjust={'wspace':0.15}) + \
ggtitle("TE")
# Make a one-vs-rest classifier
# Observed directness class counts:
#   not-bound    2508
#   indirect      913
#   direct        869
#   ambiguous     682
from basepair.stats import ols_formula, tidy_ols
import statsmodels.api as sm
import statsmodels.formula.api as smf
def fit_multiclass_lm(dftl, multiclass_reponse, num_features, cat_features=None):
    """Fit one OLS model per class, one-vs-rest style.

    For every level of the multi-class response column, a 0/1 indicator is
    regressed on the z-scored numerical features plus the optional
    categorical features via a patsy formula (`ols_formula`).

    Args:
      dftl: DataFrame holding all feature columns and the response column.
      multiclass_reponse: name of the multi-class response column.
        NOTE(review): the parameter name keeps the original 'reponse' typo
        to stay call-compatible with existing keyword callers.
      num_features: list of numerical feature columns (scaled before fitting).
      cat_features: optional list of categorical feature columns, used as-is.

    Returns:
      pd.DataFrame: tidied coefficient tables (`tidy_ols`) of all per-class
      fits concatenated, with a 'response' column naming the class.
    """
    # Avoid a mutable default argument.
    if cat_features is None:
        cat_features = []
    # One indicator column per response level.
    y = pd.get_dummies(dftl[multiclass_reponse])
    # '-' is not valid inside a patsy formula variable name (e.g. 'not-bound').
    y.columns = y.columns.str.replace("-", '')
    out = []
    for response in y.columns:
        x = scale(dftl[num_features])
        design = pd.concat([x, dftl[cat_features], y[[response]]], axis=1)
        # Spaces are not valid in formula variable names either.
        design.columns = design.columns.str.replace(" ", "_")
        design = design.dropna()
        # ols_formula presumably builds "response ~ <remaining columns>".
        results = smf.ols(ols_formula(design, response), data=design).fit()
        tidy = tidy_ols(results).reset_index()
        tidy['response'] = response
        out.append(tidy)
    return pd.concat(out)
# One-vs-rest OLS fits on the non-TE motifs, excluding the 'ambiguous' class.
df_fit = fit_multiclass_lm(dftl[(~dftl.is_te) & (dftl.directness != 'ambiguous')],
'directness', boxplot_features, ['TF'])
# Keep only coefficients significant at p < 0.05.
df_fit_signif = df_fit[df_fit['P>|t|'].astype(float) < 0.05]
df_fit_signif = df_fit_signif[df_fit_signif[""] != "Intercept"] # don't show the intercept
# significant coefficients
df_fit_signif = df_fit_signif.iloc[df_fit_signif['coef'].astype(float).abs().argsort()].iloc[::-1] # sort by the effect size
# Inspect significant coefficients per response class (note: class names had
# '-' stripped inside fit_multiclass_lm, hence 'notbound').
df_fit_signif[df_fit_signif.response == 'direct']
df_fit_signif[df_fit_signif.response == 'indirect']
df_fit_signif[df_fit_signif.response == 'notbound']
# Same one-vs-rest OLS fits, now on the TE motifs only.
df_fit = fit_multiclass_lm(dftl[(dftl.is_te) & (dftl.directness != 'ambiguous')],
'directness', boxplot_features, ['TF'])
# Keep only coefficients significant at p < 0.05.
df_fit_signif = df_fit[df_fit['P>|t|'].astype(float) < 0.05]
df_fit_signif = df_fit_signif[df_fit_signif[""] != "Intercept"] # don't show the intercept
# significant coefficients
df_fit_signif = df_fit_signif.iloc[df_fit_signif['coef'].astype(float).abs().argsort()].iloc[::-1] # sort by the effect size
# Inspect significant coefficients per response class.
df_fit_signif[df_fit_signif.response == 'direct']
df_fit_signif[df_fit_signif.response == 'indirect']
df_fit_signif[df_fit_signif.response == 'notbound']
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsOneClassifier
from sklearn.preprocessing import minmax_scale, StandardScaler
from sklearn.model_selection import RepeatedKFold, cross_val_predict
from basepair.metrics import MetricsDict
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from scikitplot.metrics import plot_confusion_matrix
def get_Xy(dftl, multiclass_reponse, num_features, cat_features=[]):
"""Get the feature vector
"""
y = pd.get_dummies(dftl[multiclass_reponse])
y_idx = y.values.argmax(1)
#y.columns = y.columns.str.replace("-", '')
#y = dftl[multiclass_reponse]
X = pd.concat([dftl[num_features], pd.get_dummies(dftl[cat_features], drop_first=True)], axis=1)
return X.values, y_idx, list(X), list(y)
# Non-TE motifs: 5-fold cross-validated multiclass logistic regression.
x,y, x_feat, y_feat = get_Xy(dftl[(~dftl.is_te) & (dftl.directness != 'ambiguous')],
'directness', boxplot_features, ['TF'])
model = make_pipeline(StandardScaler(), OneVsOneClassifier(LogisticRegression(C=1e6))) # logreg without regularization
# Out-of-fold predictions give an unbiased accuracy estimate.
y_pred = cross_val_predict(model, x, y, method='predict', cv=5)
accuracy_score(y, y_pred)
# Map integer class indices back to their string labels for the plot.
y_true_f = pd.Series(y).map({i: v for i,v in enumerate(y_feat)}).values
y_pred_f = pd.Series(y_pred).map({i: v for i,v in enumerate(y_feat)}).values
plot_confusion_matrix(y_true_f, y_pred_f);
# Same 5-fold CV classifier, now restricted to the TE motifs.
x,y, x_feat, y_feat = get_Xy(dftl[(dftl.is_te) & (dftl.directness != 'ambiguous')],
'directness', boxplot_features, ['TF'])
model = make_pipeline(StandardScaler(), OneVsOneClassifier(LogisticRegression(C=1e6))) # logreg without regularization
y_pred = cross_val_predict(model, x, y, method='predict', cv=5)
accuracy_score(y, y_pred)
# Map integer class indices back to their string labels for the plot.
y_true_f = pd.Series(y).map({i: v for i,v in enumerate(y_feat)}).values
y_pred_f = pd.Series(y_pred).map({i: v for i,v in enumerate(y_feat)}).values
plot_confusion_matrix(y_true_f, y_pred_f);