library(tidyverse)
library(tictoc)
library(pheatmap)
library(moments)
library(impute)
library(pROC)
library(caret)
library(sparklyr)
library(glmnet)
library(tensorflow)
library(keras)
# Load step 1 output
load('dat_train_cleaned_imputed.rdata')
caretConfusionMat_F1_Acc <- function(ref, predProb){
  # Use caret::confusionMatrix() and collect the confusion matrix,
  # F1-score, and accuracy.
  # Predicted labels are defined by (predProb > 0.5).
  # 'positive' is set to 'TRUE'.
  conf <- confusionMatrix(data = as.factor(predProb > 0.5),
                          reference = as.factor(ref),
                          mode = 'prec_recall',
                          positive = 'TRUE')
  F1score <- conf$byClass['F1']
  Acc <- conf$overall['Accuracy']
  out <- list(conf = conf, F1score = F1score, Accuracy = Acc)
  return(out)
}
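As a quick sanity check, the helper can be exercised on toy data (the vectors below are hypothetical, not part of the project data):
# Toy example: two of the three positives exceed the 0.5 cutoff
ref_toy  <- c(TRUE, TRUE, TRUE, FALSE, FALSE, FALSE)
prob_toy <- c(0.9, 0.7, 0.2, 0.4, 0.1, 0.6)
chk <- caretConfusionMat_F1_Acc(ref_toy, prob_toy)
chk$F1score   # F1 for the 'TRUE' class
chk$Accuracy  # overall accuracy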
kerasMlp1 <- function(x_train, y_train){
  # 1 hidden layer with the same number of nodes as the input layer
  numFeat <- ncol(x_train)
  numUnits <- round(numFeat * 1)
  # Create model
  model <- keras_model_sequential()
  # Define and compile the model
  model %>%
    layer_dense(units = numUnits, activation = 'tanh', input_shape = c(numFeat)) %>%
    layer_dropout(rate = 0.5) %>%
    layer_dense(units = 1, activation = 'sigmoid') %>%
    compile(
      loss = 'binary_crossentropy',
      optimizer = optimizer_sgd(learning_rate = 0.01, momentum = 0.1),
      metrics = c('accuracy')
    )
  print(model)
  # Train
  tic()
  history <- model %>% fit(x_train, y_train, epochs = 20, batch_size = 10,
                           validation_split = 0.2, verbose = 2)
  toc(log = TRUE)
  # Evaluate on the training data
  (kerasMlpEval <- model %>% evaluate(x_train, y_train))
  fittedvalues <- model %>% predict(x_train)
  (trainAUC <- auc(y_train, c(fittedvalues)))
  out <- list(model = model, history = history, kerasMlpEval = kerasMlpEval,
              fittedvalues = fittedvalues, trainAUC = trainAUC)
  return(out)
}
kerasMlp2 <- function(x_train, y_train){
  # 2 hidden layers where the number of nodes is 3 * (num of input nodes)
  numFeat <- ncol(x_train)
  numUnits <- round(numFeat * 3)
  # Create model
  model <- keras_model_sequential()
  # Define and compile the model
  model %>%
    layer_dense(units = numUnits, activation = 'tanh', input_shape = c(numFeat)) %>%
    layer_dropout(rate = 0.5) %>%
    layer_dense(units = numUnits, activation = 'tanh') %>%
    layer_dropout(rate = 0.5) %>%
    layer_dense(units = 1, activation = 'sigmoid') %>%
    compile(
      loss = 'binary_crossentropy',
      optimizer = optimizer_sgd(learning_rate = 0.01, momentum = 0.1),
      metrics = c('accuracy')
    )
  print(model)
  # Train
  tic()
  history <- model %>% fit(x_train, y_train, epochs = 20, batch_size = 10,
                           validation_split = 0.2, verbose = 2)
  toc(log = TRUE)
  # Evaluate on the training data
  (kerasMlpEval <- model %>% evaluate(x_train, y_train))
  fittedvalues <- model %>% predict(x_train)
  (trainAUC <- auc(y_train, c(fittedvalues)))
  out <- list(model = model, history = history, kerasMlpEval = kerasMlpEval,
              fittedvalues = fittedvalues, trainAUC = trainAUC)
  return(out)
}
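Keras weight initialization, batch shuffling, and dropout are stochastic, so the MLP metrics reported below vary from run to run. A minimal sketch of one way to pin them down, assuming a tensorflow R package version that exports set_random_seed():
# Fix the R, Python, and TensorFlow seeds in one call (run once, before fitting).
# Results may still differ slightly across hardware and backends.
tensorflow::set_random_seed(0325)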
# Split data into train and validation datasets
set.seed(0325)
validation_set_prop <- 0.25
val_set_size <- round(nrow(dat_train_sc) * validation_set_prop)
val_set_ids <- sample.int(nrow(dat_train_sc), val_set_size)
dat_train_sc_train <- dat_train_sc[-val_set_ids, ] # for model training
dat_train_sc_val <- dat_train_sc[val_set_ids, ]     # for model evaluation and
                                                    # hyperparameter tuning
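Note that sample.int() draws a simple random sample. Because the outcome is imbalanced (prevalence of about 14.5%, as the confusion matrices below show), a stratified split would preserve the class ratio in both partitions. A sketch using caret::createDataPartition() (caret is already loaded); this is illustrative only and is not used for the results below:
# Hypothetical alternative: stratified 75/25 split preserving the class ratio of y
set.seed(0325)
train_ids <- createDataPartition(factor(dat_train_sc$y), p = 0.75, list = FALSE)
# dat_train_sc[train_ids, ] would be the training set,
# dat_train_sc[-train_ids, ] the validation set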
# Define features and model formulas
features <- setdiff(colnames(dat_train_sc), "y")
features
formula1 <- as.formula(paste0('y ~ ', paste0(features, collapse = ' + ')))
formula1
# Fit glm()
lr_full <- glm(formula1, data = dat_train_sc_train, family = binomial(link = 'logit'))
summary(lr_full)
indSig <- (coef(summary(lr_full))[, 4] < 0.05)
(sigFeatures <- rownames(coef(summary(lr_full)))[indSig])
# Significant features shown (P<0.05)
# Prediction performance
pred <- predict(lr_full, newdata = dat_train_sc_val, type = 'response')
(lr_full$AUC <- auc(dat_train_sc_val$y, pred))
# Area under the curve: 0.7595
out <- caretConfusionMat_F1_Acc((dat_train_sc_val$y == 1), pred)
out$conf
(lr_full$Accuracy <- out$Accuracy)
Using logistic regression with the LASSO penalty (glmnet()), I selected candidate sets of features and fed them into logistic regressions, random forests, decision trees, and multilayer perceptrons, seeking the model with the best AUC. With the original features alone, however, none of these significantly improved on the baseline AUC (about 76%), which motivates the squared features added next.
# Make a new data frame
df_numFeatures_squared <- (as.data.frame(df_numFeatures_sc_imputed))^2
colnames0 <- paste0(colnames(df_numFeatures_squared), "_squared")
colnames(df_numFeatures_squared) <- colnames0
df_numFeatures_squared <- as_tibble(df_numFeatures_squared)
dat_train_sc2 <- bind_cols(df_y, df_numFeatures_sc_imputed, df_numFeatures_squared, df_categorical)
dat_train_sc2_train <- dat_train_sc2[-val_set_ids, ] # for model training
dat_train_sc2_val <- dat_train_sc2[val_set_ids, ]   # for model evaluation and
                                                    # hyperparameter tuning
# Set features and model formulas
features2 <- setdiff(colnames(dat_train_sc2), "y")
features2
formula2 <- as.formula(paste0('y ~ ', paste0(features2, collapse = ' + ')))
formula2
# Fit glm()
lr_full2 <- glm(formula2, data = dat_train_sc2_train, family = binomial(link = 'logit'))
summary(lr_full2)
indSig <- (coef(summary(lr_full2))[, 4] < 0.05)
(sigFeatures <- rownames(coef(summary(lr_full2)))[indSig])
# Significant features shown (P<0.05)
# Prediction performance
pred <- predict(lr_full2, newdata = dat_train_sc2_val, type = 'response')
(lr_full2$AUC <- auc(dat_train_sc2_val$y, pred))
## Area under the curve: 0.8044
out <- caretConfusionMat_F1_Acc((dat_train_sc2_val$y == 1), pred)
out$conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 8348 1153
## TRUE 199 300
##
## Accuracy : 0.8648
## 95% CI : (0.8579, 0.8714)
## No Information Rate : 0.8547
## P-Value [Acc > NIR] : 0.00201
##
## Kappa : 0.2518
##
## Mcnemar's Test P-Value : < 2e-16
##
## Precision : 0.6012
## Recall : 0.2065
## F1 : 0.3074
## Prevalence : 0.1453
## Detection Rate : 0.0300
## Detection Prevalence : 0.0499
## Balanced Accuracy : 0.5916
##
## 'Positive' Class : TRUE
##
(lr_full2$Accuracy <- out$Accuracy)
## Accuracy
## 0.8648
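With the cutoff fixed at 0.5, recall for the positive class is low (0.21) at this prevalence (14.5%). If recall mattered for the application, alternative cutoffs could be read off the validation ROC curve; a sketch with pROC (illustrative only; the 0.5 cutoff is kept for all model comparisons below):
# Illustrative: Youden-optimal probability cutoff on the validation data
roc_val <- roc(dat_train_sc2_val$y, pred)
coords(roc_val, "best", ret = c("threshold", "sensitivity", "specificity"),
       best.method = "youden")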
To visualize the non-linear relations that the squared terms are meant to capture, hexbin plots of y against two selected features (x4 and x87) with smoothed trends:
dat_train_sc2_train %>% ggplot(aes(x = x4, y = y)) +
  geom_hex() +
  scale_fill_gradient(name = 'Count(log10)', trans = "log10") +
  geom_smooth() +
  theme(text = element_text(size = 20))
dat_train_sc2_train %>% ggplot(aes(x = x87, y = y)) +
  geom_hex() +
  scale_fill_gradient(name = 'Count(log10)', trans = "log10") +
  geom_smooth() +
  theme(text = element_text(size = 20))
Using this LASSO method, I will select candidate sets of features to feed into the logistic regressions, random forests, decision trees, and multilayer perceptrons, in order to find the model with the best AUC.
First, parse the categorical features into indicator features and create a design matrix for the regression.
X <- model.matrix(formula2, data = dat_train_sc2)[, -1]
y <- dat_train_sc2$y
lr_LASSO <- glmnet(X, y, family = 'binomial', standardize = FALSE)
lr_LASSO
##
## Call: glmnet(x = X, y = y, family = "binomial", standardize = FALSE)
##
## Df %Dev Lambda
## 1 0 0.00 0.080800
## 2 2 1.42 0.073620
## 3 2 2.91 0.067080
## 4 2 4.18 0.061120
## 5 2 5.28 0.055690
## 6 2 6.23 0.050740
## 7 2 7.05 0.046240
## 8 3 7.90 0.042130
## 9 3 8.81 0.038390
## 10 4 9.78 0.034980
## 11 4 10.62 0.031870
## 12 4 11.34 0.029040
## 13 4 11.96 0.026460
## 14 4 12.48 0.024110
## 15 4 12.93 0.021970
## 16 4 13.32 0.020010
## 17 5 13.64 0.018240
## 18 6 13.94 0.016620
## 19 10 14.25 0.015140
## 20 12 14.58 0.013800
## 21 13 14.87 0.012570
## 22 13 15.17 0.011450
## 23 16 15.43 0.010440
## 24 17 15.78 0.009509
## 25 17 16.11 0.008664
## 26 19 16.41 0.007894
## 27 19 16.70 0.007193
## 28 20 16.96 0.006554
## 29 24 17.19 0.005972
## 30 25 17.39 0.005441
## 31 26 17.58 0.004958
## 32 28 17.77 0.004517
## 33 31 17.94 0.004116
## 34 34 18.09 0.003750
## 35 38 18.23 0.003417
## 36 40 18.35 0.003114
## 37 47 18.47 0.002837
## 38 51 18.59 0.002585
## 39 58 18.70 0.002355
## 40 71 18.85 0.002146
## 41 78 19.01 0.001955
## 42 88 19.16 0.001782
## 43 94 19.30 0.001623
## 44 103 19.45 0.001479
## 45 108 19.61 0.001348
## 46 113 19.76 0.001228
## 47 114 19.90 0.001119
## 48 120 20.02 0.001020
## 49 130 20.15 0.000929
## 50 133 20.27 0.000847
## 51 141 20.37 0.000771
## 52 151 20.49 0.000703
## 53 153 20.61 0.000640
## 54 154 20.71 0.000583
## 55 160 20.80 0.000532
## 56 171 20.88 0.000484
## 57 174 20.95 0.000441
## 58 178 21.03 0.000402
## 59 182 21.10 0.000366
## 60 186 21.16 0.000334
## 61 192 21.22 0.000304
## 62 198 21.27 0.000277
## 63 200 21.32 0.000253
## 64 205 21.36 0.000230
## 65 204 21.39 0.000210
## 66 209 21.42 0.000191
## 67 210 21.45 0.000174
## 68 213 21.47 0.000159
## 69 214 21.49 0.000144
## 70 215 21.51 0.000132
## 71 218 21.52 0.000120
## 72 223 21.53 0.000109
## 73 225 21.55 0.000100
## 74 227 21.56 0.000091
## 75 228 21.56 0.000083
## 76 229 21.57 0.000075
## 77 230 21.58 0.000069
## 78 233 21.58 0.000063
## 79 233 21.59 0.000057
## 80 235 21.59 0.000052
## 81 237 21.59 0.000047
## 82 238 21.60 0.000043
## 83 238 21.60 0.000039
## 84 238 21.60 0.000036
## 85 238 21.60 0.000033
## 86 242 21.60 0.000030
## 87 243 21.60 0.000027
## 88 245 21.61 0.000025
plot(lr_LASSO, label = TRUE)
set.seed(0325)
# Note: cv.glmnet() defaults to family = 'gaussian'; here the cross-validated
# lambdas are used only to anchor candidate sparsity levels on the binomial path.
cv.fit <- cv.glmnet(X, y, nfolds = 4)
plot(cv.fit)
log(cv.fit$lambda.min)
## [1] -7.074425
log(cv.fit$lambda.1se)
## [1] -5.306784
The recommended sparsity corresponds to lambdas between lambda.min and lambda.1se; lambda.1se gives the most regularized model whose cross-validation error is within one standard error of the minimum.
To obtain 4 candidate subsets of features, I choose the following lambdas, which yield feature sets of increasing size (the fourth, exp(-8.5), lies beyond lambda.min and gives an even larger set).
Including the full model, a total of 5 feature sets will therefore be considered across the various prediction models in search of the one with the best AUC.
# Prepare variables
lambdas <- list()
coeffs <- list()
sigGlmFeatures <- list()
sigOrigFeatures <- list()
# Lambdas were chosen from the output of the above cross-validation using glmnet()
lambdas[[1]] <- cv.fit$lambda.1se
lambdas[[2]] <- exp( (log(cv.fit$lambda.1se) + log(cv.fit$lambda.min))/2 )
lambdas[[3]] <- cv.fit$lambda.min
lambdas[[4]] <- exp(-8.5)
# Lambdas are matched to the corresponding features with non-zero coefficients
for (k in 1:4){
  coeffs[[k]] <- coef(lr_LASSO, s = lambdas[[k]])
  sigGlmFeatures[[k]] <- coeffs[[k]]@Dimnames[[1]][coeffs[[k]]@i + 1][-1]
  print(paste0('#### k = ', k))
  print(sigGlmFeatures[[k]])
}
# Parsed binary indicators are pooled back to original categorical features
(sigOrigFeatures[[1]] <- c(sigGlmFeatures[[1]][1:23], 'x3', 'x31', 'x93'))
(sigOrigFeatures[[2]] <- c(sigGlmFeatures[[2]][1:70], 'x3', 'x31', 'x33', 'x93'))
(sigOrigFeatures[[3]] <-
c(sigGlmFeatures[[3]][1:112], 'x3', 'x31', 'x33', 'x60', 'x65', 'x93', 'x98'))
(sigOrigFeatures[[4]] <-
c(sigGlmFeatures[[4]][1:148], 'x3', 'x31', 'x33', 'x60', 'x65', 'x77', 'x93', 'x98'))
(sigOrigFeatures[[5]] <- c(colnames(dat_train_sc2)[-1]))
# The number of glm features (with parsed binaries) across all 5 candidate
# feature sets, including the full model
c(as.numeric(lapply(sigGlmFeatures, length)), ncol(X))
# [1] 26 78 133 209 255
# The number of original features across all 5 candidate feature sets,
# including the full model
c(as.numeric(lapply(sigOrigFeatures, length)))
# [1] 26 74 119 156 179
# Save
save(lambdas, coeffs, sigGlmFeatures, sigOrigFeatures,
file = 'glmLASSO_selectedFeatSets_includeSquared.rdata')
mdl_formulas <- list()
lr_models <- list()
lr_AUC <- list()
lr_Accuracy <- list()
# Specify model formulas
for (k in 1:length(sigOrigFeatures)){
  mdl_formulas[[k]] <- as.formula(
    paste0('y ~ ', paste0(sigOrigFeatures[[k]], collapse = ' + ')))
}
# Fit glm() and compute prediction measures for validation data
for (k in 1:length(sigOrigFeatures)){
  lr_models[[k]] <- glm(mdl_formulas[[k]], data = dat_train_sc2_train,
                        family = binomial(link = 'logit'))
  print(summary(lr_models[[k]]))
  pred <- predict(lr_models[[k]], newdata = dat_train_sc2_val, type = 'response')
  (lr_models[[k]]$AUC <- auc(dat_train_sc2_val$y, pred))
  cat(paste0("\n## k = ", k, ", Prediction AUC: ", round(lr_models[[k]]$AUC, 4)), "\n")
  out <- caretConfusionMat_F1_Acc((dat_train_sc2_val$y == 1), pred)
  print(out$conf)
  (lr_models[[k]]$Accuracy <- out$Accuracy)
  cat(paste0("\n## k = ", k, ", Prediction Accuracy: ",
             round(lr_models[[k]]$Accuracy, 4)), "\n")
  lr_AUC[[k]] <- lr_models[[k]]$AUC
  lr_Accuracy[[k]] <- lr_models[[k]]$Accuracy
}
print(lr_AUC) # (max AUC) k = 2, 0.8093
## [[1]]
## Area under the curve: 0.7986
##
## [[2]]
## Area under the curve: 0.8093
##
## [[3]]
## Area under the curve: 0.8076
##
## [[4]]
## Area under the curve: 0.8054
##
## [[5]]
## Area under the curve: 0.8044
print(lr_Accuracy) # (max Accuracy) k = 2, 0.8685
## [[1]]
## Accuracy
## 0.8631
##
## [[2]]
## Accuracy
## 0.8685
##
## [[3]]
## Accuracy
## 0.8671
##
## [[4]]
## Accuracy
## 0.8649
##
## [[5]]
## Accuracy
## 0.8648
print(length(sigOrigFeatures[[2]]))
print(sigOrigFeatures[[2]])
# Among the 5 considered feature sets, the following set (k = 2) is selected by GLM.
#[1] "x11" "x14" "x16" "x26" "x41"
#[6] "x42" "x52" "x54" "x61" "x67"
#[11] "x68" "x75" "x83" "x89" "x96"
#[16] "x1" "x7" "x9" "x15" "x18"
#[21] "x19" "x28" "x36" "x40" "x46"
#[26] "x47" "x48" "x51" "x56" "x62"
#[31] "x66" "x81" "x97" "x100" "x5_squared"
#[36] "x16_squared" "x22_squared" "x38_squared" "x45_squared" "x55_squared"
#[41] "x63_squared" "x64_squared" "x74_squared" "x75_squared" "x80_squared"
#[46] "x83_squared" "x85_squared" "x89_squared" "x94_squared" "x4_squared"
#[51] "x8_squared" "x18_squared" "x19_squared" "x21_squared" "x32_squared"
#[56] "x34_squared" "x37_squared" "x40_squared" "x43_squared" "x46_squared"
#[61] "x47_squared" "x50_squared" "x53_squared" "x69_squared" "x71_squared"
#[66] "x72_squared" "x73_squared" "x82_squared" "x84_squared" "x100_squared"
#[71] "x3" "x31" "x33" "x93"
# Save
save(lr_models, lr_AUC, lr_Accuracy,
file = "lr_models_AUC_Acc.rdata")
rf_models <- list()
mdl_formulas_wDummy <- list()
rf_Accuracy <- list()
# Spaces in the state names of 'x33' break the parsed model formula, so the
# factor levels are re-coded by removing the spaces.
dat_train_sc2_x33recoded <- dat_train_sc2
x33 <- dat_train_sc2_x33recoded$x33
levels(x33) <- gsub(" ", "", levels(x33), fixed = TRUE)
dat_train_sc2_x33recoded$x33 <- x33
# Generate new data frames in which categorical features are parsed into binary indicators
X_x33recoded <- model.matrix(formula2, data = dat_train_sc2_x33recoded)[, -1]
dat_train_sc2_dummy <- as_tibble(cbind(y = dat_train_sc2_x33recoded$y, X_x33recoded))
dat_train_sc2_dummy_train <- dat_train_sc2_dummy[-val_set_ids, ]
dat_train_sc2_dummy_val <- dat_train_sc2_dummy[val_set_ids, ]
sc <- spark_connect(master = 'local')
dat_train_sc2_dummy_train_tbl <-
copy_to(sc, dat_train_sc2_dummy_train, 'dat_train_sc2_dummy_train_tbl', overwrite = T)
dat_train_sc2_dummy_val_tbl <-
copy_to(sc, dat_train_sc2_dummy_val, 'dat_train_sc2_dummy_val_tbl', overwrite = T)
# Generate new model formulas according to the new df's with the recoded x33
for (k in 1:length(mdl_formulas)){
  tmpMdlMat <- model.matrix(mdl_formulas[[k]], data = dat_train_sc2_x33recoded)[, -1]
  mdl_formulas_wDummy[[k]] <-
    as.formula(paste0('y ~ ', paste0(colnames(tmpMdlMat), collapse = ' + ')))
}
# Fit ml_random_forest() and compute prediction measures for validation data
for (k in 1:length(mdl_formulas_wDummy)){
  rf_models[[k]] <- ml_random_forest(
    dat_train_sc2_dummy_train_tbl, mdl_formulas_wDummy[[k]], type = "classification")
  print(rf_models[[k]])
  mlEval <- ml_evaluate(rf_models[[k]], dat_train_sc2_dummy_val_tbl) # Val Accuracy
  rf_Accuracy[[k]] <- mlEval
}
print(rf_Accuracy)
## [[1]]
## # A tibble: 1 x 1
## Accuracy
## <dbl>
## 1 0.788
##
## [[2]]
## # A tibble: 1 x 1
## Accuracy
## <dbl>
## 1 0.788
##
## [[3]]
## # A tibble: 1 x 1
## Accuracy
## <dbl>
## 1 0.788
##
## [[4]]
## # A tibble: 1 x 1
## Accuracy
## <dbl>
## 1 0.788
##
## [[5]]
## # A tibble: 1 x 1
## Accuracy
## <dbl>
## 1 0.788
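The validation accuracy is identical (0.788) across all five feature sets, which hints that the forest's default settings, rather than the feature set, may be limiting the fit. If one wanted to probe this, ml_random_forest() exposes the usual hyperparameters; a sketch with illustrative, untuned values:
# Illustrative only: a larger, deeper forest on the k = 2 feature set
rf_deeper <- ml_random_forest(
  dat_train_sc2_dummy_train_tbl, mdl_formulas_wDummy[[2]],
  type = "classification",
  num_trees = 100,  # more trees than the default
  max_depth = 10,   # deeper trees than the default
  seed = 0325)
ml_evaluate(rf_deeper, dat_train_sc2_dummy_val_tbl)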
dt_models <- list()
dt_Accuracy <- list()
# Fit ml_decision_tree() and compute prediction measures for validation data
for (k in 1:length(mdl_formulas_wDummy)){
  dt_models[[k]] <- ml_decision_tree(dat_train_sc2_dummy_train_tbl, mdl_formulas_wDummy[[k]],
                                     type = "classification")
  print(dt_models[[k]])
  mlEval <- ml_evaluate(dt_models[[k]], dat_train_sc2_dummy_val_tbl) # Val Accuracy
  dt_Accuracy[[k]] <- mlEval
}
print(dt_Accuracy)
## [[1]]
## # A tibble: 1 x 1
## Accuracy
## <dbl>
## 1 0.808
##
## [[2]]
## # A tibble: 1 x 1
## Accuracy
## <dbl>
## 1 0.809
##
## [[3]]
## # A tibble: 1 x 1
## Accuracy
## <dbl>
## 1 0.809
##
## [[4]]
## # A tibble: 1 x 1
## Accuracy
## <dbl>
## 1 0.800
##
## [[5]]
## # A tibble: 1 x 1
## Accuracy
## <dbl>
## 1 0.800
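With the Spark-based random forests and decision trees finished, the local connection can be closed (assuming no further sparklyr calls follow):
# Release the local Spark session
spark_disconnect(sc)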
For the model specifications, I chose to run two MLP models.
The first has one hidden layer with the same number of nodes as the input layer.
The second has two hidden layers, each with 3 * (number of input nodes) units, which should be able to approximate non-linear relations between X and y if any exist.
Through preliminary fitting with the MLPs, I selected a set of tuning parameters (learning rate, momentum, epochs, batch size, and dropout rate); their values are hard-coded in kerasMlp1() and kerasMlp2() above.
mlp1_models <- list()
mlp1_AUC <- list()
mlp1_Accuracy <- list()
mlp2_models <- list()
mlp2_AUC <- list()
mlp2_Accuracy <- list()
for (k in 1:length(mdl_formulas)){
  x_train <- model.matrix(mdl_formulas[[k]], data = dat_train_sc2_train)[, -1]
  y_train <- dat_train_sc2_train$y
  mlp1_models[[k]] <- kerasMlp1(x_train, y_train)
  # Validation data
  x_val <- model.matrix(mdl_formulas[[k]], data = dat_train_sc2_val)[, -1]
  y_val <- dat_train_sc2_val$y
  pred <- mlp1_models[[k]]$model %>% predict(x_val)
  (mlp1_AUC[[k]] <- auc(y_val, c(pred)))
  cat(paste0("\n## k = ", k, ", Prediction AUC: ", round(mlp1_AUC[[k]], 4)), "\n")
  out <- caretConfusionMat_F1_Acc((dat_train_sc2_val$y == 1), c(pred))
  print(out$conf)
  mlp1_Accuracy[[k]] <- out$Accuracy
  cat(paste0("\n## k = ", k, ", Prediction Accuracy: ",
             round(mlp1_Accuracy[[k]], 4)), "\n")
}
# Prediction measures
print(mlp1_AUC)
## [[1]]
## Area under the curve: 0.7952
##
## [[2]]
## Area under the curve: 0.8015
##
## [[3]]
## Area under the curve: 0.7941
##
## [[4]]
## Area under the curve: 0.7901
##
## [[5]]
## Area under the curve: 0.7889
print(mlp1_Accuracy)
## [[1]]
## Accuracy
## 0.8642
##
## [[2]]
## Accuracy
## 0.8654
##
## [[3]]
## Accuracy
## 0.8616
##
## [[4]]
## Accuracy
## 0.8615
##
## [[5]]
## Accuracy
## 0.8573
for (k in 1:length(mdl_formulas)){
  x_train <- model.matrix(mdl_formulas[[k]], data = dat_train_sc2_train)[, -1]
  y_train <- dat_train_sc2_train$y
  mlp2_models[[k]] <- kerasMlp2(x_train, y_train)
  # Validation data
  x_val <- model.matrix(mdl_formulas[[k]], data = dat_train_sc2_val)[, -1]
  y_val <- dat_train_sc2_val$y
  pred <- mlp2_models[[k]]$model %>% predict(x_val)
  (mlp2_AUC[[k]] <- auc(y_val, c(pred)))
  cat(paste0("\n## k = ", k, ", Prediction AUC: ", round(mlp2_AUC[[k]], 4)), "\n")
  out <- caretConfusionMat_F1_Acc((dat_train_sc2_val$y == 1), c(pred))
  print(out$conf)
  mlp2_Accuracy[[k]] <- out$Accuracy
  cat(paste0("\n## k = ", k, ", Prediction Accuracy: ",
             round(mlp2_Accuracy[[k]], 4)), "\n")
}
# Prediction measures
print(mlp2_AUC)
## [[1]]
## Area under the curve: 0.7953
##
## [[2]]
## Area under the curve: 0.799
##
## [[3]]
## Area under the curve: 0.787
##
## [[4]]
## Area under the curve: 0.7908
##
## [[5]]
## Area under the curve: 0.7888
print(mlp2_Accuracy)
## [[1]]
## Accuracy
## 0.8645
##
## [[2]]
## Accuracy
## 0.8629
##
## [[3]]
## Accuracy
## 0.8604
##
## [[4]]
## Accuracy
## 0.8644
##
## [[5]]
## Accuracy
## 0.862
# Save
save(mlp1_models, mlp1_AUC, mlp1_Accuracy, mlp2_models, mlp2_AUC, mlp2_Accuracy,
file = "MLP_models_AUC_Acc.rdata")
save_model_hdf5(mlp1_models[[2]]$model, 'mlp1_model_k2.h5')
save_model_hdf5(mlp2_models[[2]]$model, 'mlp2_model_k2.h5')
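To reuse the saved networks later without retraining, keras provides the counterpart loader:
# Reload the saved optimal MLP in a later session
mlp1_model_k2 <- load_model_hdf5('mlp1_model_k2.h5')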
As the optimal MLP model, I select the one-hidden-layer model with the k = 2 feature set (mlp1_models[[2]]).
In one run, a two-hidden-layer model showed a slightly better AUC (0.7994 vs. 0.7978 for mlp1_models[[2]]), but the difference is negligible, and Keras training is stochastic, so these figures shift between runs.
Thus, the more parsimonious model is preferred.
For the record, the optimal MLP model (mlp1_models with the k = 2 feature set) is defined and trained as follows.
k <- 2
x_train <- model.matrix(mdl_formulas[[k]], data = dat_train_sc2_train)[, -1]
y_train <- dat_train_sc2_train$y
mlp1_models[[k]] <- kerasMlp1(x_train, y_train)
## Model: "sequential_10"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_26 (Dense) (None, 129) 16770
## dropout_15 (Dropout) (None, 129) 0
## dense_25 (Dense) (None, 1) 130
## ================================================================================
## Total params: 16,900
## Trainable params: 16,900
## Non-trainable params: 0
## ________________________________________________________________________________
## 69.27 sec elapsed
# Validation data
x_val <- model.matrix(mdl_formulas[[k]], data = dat_train_sc2_val)[, -1]
y_val <- dat_train_sc2_val$y
pred <- mlp1_models[[k]]$model %>% predict(x_val)
(mlp1_AUC[[k]] <- auc(y_val, c(pred)))
## Area under the curve: 0.7972
cat(paste0("\n## k = ", k, ", Prediction AUC: ", round(mlp1_AUC[[k]], 4)), "\n")
##
## ## k = 2, Prediction AUC: 0.7972
out <- caretConfusionMat_F1_Acc((dat_train_sc2_val$y == 1), c(pred))
print(out$conf)
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 8410 1216
## TRUE 137 237
##
## Accuracy : 0.8647
## 95% CI : (0.8578, 0.8713)
## No Information Rate : 0.8547
## P-Value [Acc > NIR] : 0.002202
##
## Kappa : 0.2126
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Precision : 0.6337
## Recall : 0.1631
## F1 : 0.2594
## Prevalence : 0.1453
## Detection Rate : 0.0237
## Detection Prevalence : 0.0374
## Balanced Accuracy : 0.5735
##
## 'Positive' Class : TRUE
##
mlp1_Accuracy[[k]] <- out$Accuracy
cat(paste0("\n## k = ", k, ", Prediction Accuracy: ",
round(mlp1_Accuracy[[k]], 4)), "\n")
##
## ## k = 2, Prediction Accuracy: 0.8647