This document presents a one-class learning method for predicting one-year survival of thoracic surgery patients, using an autoencoder with scaled exponential linear unit (SELU) activations. The workflow consists of (1) importing and preprocessing the data, (2) training an autoencoder on the survived (majority) class only, and (3) flagging likely deaths as anomalies with kernel density estimation or a Gaussian mixture model fitted to the hidden-layer representation.


1. Import and Preprocess

1.1. Import Packages

library(keras)      # package for deep learning
library(readr)      # package for reading files
library(dplyr)      # package for preprocessing
library(purrr)      # package for preprocessing
library(knitr)      # package for report generation
library(kableExtra) # package for report generation (table formatting)
library(caret)      # package for machine learning utilities (data splitting, dummy coding, preprocessing)
library(yardstick)  # package for measuring model performance
library(stringr)    # package for string manipulation
library(imbalance)  # package for oversampling
library(ROSE)       # package for oversampling
library(pROC)       # package for ROC curves
library(AppliedPredictiveModeling) # package with data sets from the APM book
library(GGally)        # package for ggpairs/ggcorr plot matrices
library(ggplot2)       # package for plotting
library(rgl)           # package for 3D visualization
library(RColorBrewer)  # package for color palettes
library(foreign)       # package for reading ARFF files (read.arff)
library(ks)            # package for kernel density estimation
library(mclust)        # package for Gaussian mixture models

1.2. Read Data

dat <- read.arff(url("http://archive.ics.uci.edu/ml/machine-learning-databases/00277/ThoraricSurgery.arff"))
dim(dat)
## [1] 470  17
colnames(dat) <- c("Diag", "FVC", "FEV1", "Zubrod", "Pain", "Hemoptysis", "Dyspnoea", "Cough", "Weakness", 
                   "TumorSize", "DiabetesMellitus", "MI6Month", "PAD", "Smoking", "Asthma", "Age", "Risk1Yr")

head(dat, 5) 
## number of unique values for each variable
data.frame(t(sapply(dat, function(x) length(unique(x)))))
## number of NAs for each variable
nasum <- sapply(dat, function(x) sum(is.na(x)))
nasum
##             Diag              FVC             FEV1           Zubrod 
##                0                0                0                0 
##             Pain       Hemoptysis         Dyspnoea            Cough 
##                0                0                0                0 
##         Weakness        TumorSize DiabetesMellitus         MI6Month 
##                0                0                0                0 
##              PAD          Smoking           Asthma              Age 
##                0                0                0                0 
##          Risk1Yr 
##                0

There are no missing values, so no imputation is needed.

[1] "Diag": Diagnosis; specific combination of ICD-10 codes for primary and secondary as well multiple tumors if any - (DGN3,DGN2,DGN4,DGN6,DGN5,DGN8,DGN1)

[2] "FVC": Forced vital capacity - FVC (numeric)

[3] "FEV1": Volume that has been exhaled at the end of the first second of forced expiration - FEV1 (numeric)

[4] "Zubrod": Performance status - Zubrod scale (PRZ2,PRZ1,PRZ0)

[5] "Pain": Pain before surgery (T,F)

[6] "Hemoptysis": Haemoptysis before surgery (T,F)

[7] "Dyspnoea": Dyspnoea before surgery (T,F)

[8] "Cough": Cough before surgery (T,F)

[9] "Weakness": Weakness before surgery (T,F)

[10] "TumorSize": T in clinical TNM - size of the original tumour, from OC11 (smallest) to OC14 (largest) (OC11,OC14,OC12,OC13)

[11] "DiabetesMellitus": Type 2 DM - diabetes mellitus (T,F)

[12] "MI6Month": MI up to 6 months (T,F)

[13] "PAD": PAD - peripheral arterial diseases (T,F)

[14] "Smoking": Smoking (T,F)

[15] "Asthma": Asthma (T,F)

[16] "Age": Age at surgery (numeric)

[17] "Risk1Yr": 1 year survival period - (T)rue value if died (T,F)

1.3. Preprocessing

## Separate the predictors from the outcome and encode the outcome as 0/1 (0 = survived, 1 = died).
X_dat <- dat %>% select(-c(Risk1Yr)) %>% as.data.frame()
y_dat <- dat$Risk1Yr %>% as.numeric() - 1

## Full-rank dummy coding of the categorical predictors.
X_dat2 <- predict(dummyVars(~., data = X_dat, fullRank = TRUE), X_dat) %>% as.data.frame()

dim(X_dat2)
## [1] 470  24
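
The 24 columns come from full-rank dummy coding (a rough accounting, assuming no columns are dropped): 3 numeric variables (FVC, FEV1, Age), 10 binary T/F factors, and 6 + 2 + 3 dummies for Diag, Zubrod, and TumorSize, i.e. 3 + 10 + 11 = 24.
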
head(X_dat2, 5)
table(y_dat)
## y_dat
##   0   1 
## 400  70
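
Only about 15% of patients (70 of 470) died within one year; this imbalance motivates the one-class setup below, where the autoencoder is trained on the survived class only.

prop.table(table(y_dat))  # roughly 0.85 (survived) vs 0.15 (died)
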
## Create 10 random train/validation/test splits (roughly 60%/20%/20% of the 470 rows).
X_train_list <- list()
X_valid_list <- list()
X_test_list <- list()
y_train_list <- list()
y_valid_list <- list()
y_test_list <- list()
for(seedind in 1:10) {
  set.seed(2019 + seedind)
  trainvalid_index <- createDataPartition(y_dat, p = 0.8, list = FALSE)
  
  X_trainvalid <- X_dat2[trainvalid_index,] %>% as.matrix()
  y_trainvalid <- y_dat[trainvalid_index]
  
  X_test <- X_dat2[-trainvalid_index,] %>% as.matrix()
  y_test <- y_dat[-trainvalid_index]
  
  train_index <- createDataPartition(y_trainvalid, p = 0.75, list = FALSE)
  
  X_train <- X_trainvalid[train_index,]
  y_train <- y_trainvalid[train_index]
  
  X_valid <- X_trainvalid[-train_index,]
  y_valid <- y_trainvalid[-train_index]
  
  ## Rescale all columns to [0, 1] ("range") and drop zero-variance columns ("zv").
  
  prepro <- preProcess(X_train, c("range", "zv"))
  X_train2 <- predict(prepro, X_train)
  X_valid2 <- predict(prepro, X_valid)
  X_test2 <- predict(prepro, X_test)
  
  numevar <- c("FVC", "FEV1", "Age")
  catevar <- colnames(X_train2)[!(colnames(X_train2) %in% numevar)]
  
  X_train_list[[seedind]] <- X_train2
  X_valid_list[[seedind]] <- X_valid2
  X_test_list[[seedind]]  <- X_test2
  y_train_list[[seedind]] <- y_train
  y_valid_list[[seedind]] <- y_valid
  y_test_list[[seedind]]  <- y_test
}
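
As a quick sanity check (a sketch; exact counts depend on createDataPartition's stratified rounding), the first split should contain roughly 282 / 94 / 94 rows:

sapply(list(train = X_train_list[[1]], valid = X_valid_list[[1]], test = X_test_list[[1]]), nrow)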

1.4. Distributions and Scatter Plots of Variables

ggpairs(X_dat, aes(color = as.factor(y_dat)), 
        lower = list(combo = wrap("box_no_facet"), discrete = wrap("ratio")),
        diag = list(continuous = wrap("densityDiag", alpha = 0.5)),
        upper = NULL)

2.1. Autoencoder with Kernel Density Estimation Model

len_layer2 <- floor(1 + sqrt(ncol(X_train2)))
len_layer1 <- floor(1 + sqrt(len_layer2))
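
To make the sizing rule concrete (assuming no zero-variance columns were dropped, so ncol(X_train2) is 24):

floor(1 + sqrt(24))  # 5 units in the outer hidden layers (len_layer2)
floor(1 + sqrt(5))   # 3 units in the bottleneck (len_layer1)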

library(keras)
use_session_with_seed(2019)
## Set session seed to 2019 (disabled GPU, CPU parallelism)
input_nume <- layer_input(c(dim(X_train2)[2]))

pred_nume <- input_nume %>% 
  layer_dense(len_layer2) %>%          # encoder: 24 -> 5
  layer_activation("selu") %>%
  layer_dense(len_layer1) %>%          # bottleneck: 5 -> 3
  layer_activation("selu") %>%
  layer_dense(len_layer2) %>%          # decoder: 3 -> 5
  layer_activation("selu") %>%
  layer_dense(dim(X_train2)[2]) %>%    # reconstruction: 5 -> 24
  layer_activation("sigmoid")          # outputs in [0, 1], matching the rescaled inputs

model_nume <- keras_model(input_nume, pred_nume)

summary(model_nume)
## ___________________________________________________________________________
## Layer (type)                     Output Shape                  Param #     
## ===========================================================================
## input_1 (InputLayer)             (None, 24)                    0           
## ___________________________________________________________________________
## dense_1 (Dense)                  (None, 5)                     125         
## ___________________________________________________________________________
## activation_1 (Activation)        (None, 5)                     0           
## ___________________________________________________________________________
## dense_2 (Dense)                  (None, 3)                     18          
## ___________________________________________________________________________
## activation_2 (Activation)        (None, 3)                     0           
## ___________________________________________________________________________
## dense_3 (Dense)                  (None, 5)                     20          
## ___________________________________________________________________________
## activation_3 (Activation)        (None, 5)                     0           
## ___________________________________________________________________________
## dense_4 (Dense)                  (None, 24)                    144         
## ___________________________________________________________________________
## activation_4 (Activation)        (None, 24)                    0           
## ===========================================================================
## Total params: 307
## Trainable params: 307
## Non-trainable params: 0
## ___________________________________________________________________________
model_nume %>% compile(
  optimizer = optimizer_adam(0.001),
  loss = "mean_squared_error"
  # loss = "mean_absolute_error"
  # loss = "logcosh"
)

## One-class training: fit the autoencoder on the survived class (y == 0) only.
history_nume <- model_nume %>% fit(X_train2[y_train==0,], X_train2[y_train==0,],
                                   batch_size = 4096,
                                   validation_data = list(X_valid2[y_valid==0,], X_valid2[y_valid==0,]),
                                   epochs = 10000,
                                   verbose = 0, 
                                   callbacks = list(callback_early_stopping(monitor = "val_loss", patience = 2000, restore_best_weights = TRUE),
                                                    callback_reduce_lr_on_plateau(monitor = "val_loss", factor = 0.1, patience = 500)))

print(history_nume)
## Trained on 234 samples, validated on 85 samples (batch_size=4,096, epochs=4,292)
## Final epoch (plot to see history):
## val_loss: 0.04296
##     loss: 0.03407
##       lr: 0.0000001
# plot(history_nume)

## Rebuild the encoder (input -> bottleneck, without the final SELU activation)
## and copy over the weights learned by the full autoencoder.
layer_weight <- keras::get_weights(model_nume)

pred_nume2 <- input_nume %>% 
  layer_dense(len_layer2) %>%
  layer_activation("selu") %>%
  layer_dense(len_layer1)

model_nume2 <- keras_model(input_nume, pred_nume2)
model_nume2 %>% set_weights(layer_weight)
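
An equivalent way to expose the bottleneck (a sketch, relying on the auto-generated layer name dense_2 shown in summary(model_nume) above) is to reuse the trained graph directly instead of copying weights:

encoder <- keras_model(inputs = model_nume$input,
                       outputs = get_layer(model_nume, "dense_2")$output)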

ae_hidden_train <- predict(model_nume2, X_train2)

p1 <- ggpairs(data.frame(ae_hidden_train), mapping = aes(color = as.factor(y_train)),
              lower = list(continuous = wrap("points", alpha = 0.7)),
              diag = list(continuous = wrap("densityDiag", alpha = 0.5)),
              upper = list(continuous = wrap("cor", size = 3)), title = "Hidden layer of Training set")
# Correlation matrix plot
p2 <- ggcorr(data.frame(ae_hidden_train), label = TRUE, label_round = 2)
g2 <- ggplotGrob(p2)
colors <- g2$grobs[[6]]$children[[3]]$gp$fill
# Change background color to tiles in the upper triangular matrix of plots
idx <- 1
p <- ncol(ae_hidden_train)
for (k1 in 1:(p-1)) {
  for (k2 in (k1+1):p) {
    plt <- getPlot(p1,k1,k2) +
      theme(panel.background = element_rect(fill = colors[idx], color="white"),
            panel.grid.major = element_line(color = colors[idx]))
    p1 <- putPlot(p1,plt,k1,k2)
    idx <- idx+1
  }
}
p1

ae_hidden_test <- predict(model_nume2, X_test2)

p1 <- ggpairs(data.frame(ae_hidden_test), mapping = aes(color = as.factor(y_test)),
              lower = list(continuous = wrap("points", alpha = 0.7)),
              diag = list(continuous = wrap("densityDiag", alpha = 0.5)),
              upper = list(continuous = wrap("cor", size = 3)), title = "Hidden layer of Test set")
# Correlation matrix plot
p2 <- ggcorr(data.frame(ae_hidden_test), label = TRUE, label_round = 2)
g2 <- ggplotGrob(p2)
colors <- g2$grobs[[6]]$children[[3]]$gp$fill
# Change background color to tiles in the upper triangular matrix of plots
idx <- 1
p <- ncol(ae_hidden_test)
for (k1 in 1:(p-1)) {
  for (k2 in (k1+1):p) {
    plt <- getPlot(p1,k1,k2) +
      theme(panel.background = element_rect(fill = colors[idx], color="white"),
            panel.grid.major = element_line(color = colors[idx]))
    p1 <- putPlot(p1,plt,k1,k2)
    idx <- idx+1
  }
}
p1

The same autoencoder is now refit with an L2 penalty (0.01) on the bottleneck layer; everything else is unchanged.

len_layer2 <- floor(1 + sqrt(ncol(X_train2)))
len_layer1 <- floor(1 + sqrt(len_layer2))

library(keras)
use_session_with_seed(2019)
## Set session seed to 2019 (disabled GPU, CPU parallelism)
input_nume <- layer_input(c(dim(X_train2)[2]))

pred_nume <- input_nume %>% 
  layer_dense(len_layer2) %>%
  layer_activation("selu") %>%
  layer_dense(len_layer1, kernel_regularizer = regularizer_l2(0.01)) %>%
  layer_activation("selu") %>%
  layer_dense(len_layer2) %>%
  layer_activation("selu") %>%
  layer_dense(dim(X_train2)[2]) %>%
  layer_activation("sigmoid")

model_nume <- keras_model(input_nume, pred_nume)

summary(model_nume)
## ___________________________________________________________________________
## Layer (type)                     Output Shape                  Param #     
## ===========================================================================
## input_1 (InputLayer)             (None, 24)                    0           
## ___________________________________________________________________________
## dense_1 (Dense)                  (None, 5)                     125         
## ___________________________________________________________________________
## activation_1 (Activation)        (None, 5)                     0           
## ___________________________________________________________________________
## dense_2 (Dense)                  (None, 3)                     18          
## ___________________________________________________________________________
## activation_2 (Activation)        (None, 3)                     0           
## ___________________________________________________________________________
## dense_3 (Dense)                  (None, 5)                     20          
## ___________________________________________________________________________
## activation_3 (Activation)        (None, 5)                     0           
## ___________________________________________________________________________
## dense_4 (Dense)                  (None, 24)                    144         
## ___________________________________________________________________________
## activation_4 (Activation)        (None, 24)                    0           
## ===========================================================================
## Total params: 307
## Trainable params: 307
## Non-trainable params: 0
## ___________________________________________________________________________
model_nume %>% compile(
  optimizer = optimizer_adam(0.001),
  loss = "mean_squared_error"
  # loss = "mean_absolute_error"
  # loss = "logcosh"
)

history_nume <- model_nume %>% fit(X_train2[y_train==0,], X_train2[y_train==0,],
                                   batch_size = 4096,
                                   validation_data = list(X_valid2[y_valid==0,], X_valid2[y_valid==0,]),
                                   epochs = 10000,
                                   verbose = 0, 
                                   callbacks = list(callback_early_stopping(monitor = "val_loss", patience = 2000, restore_best_weights = TRUE),
                                                    callback_reduce_lr_on_plateau(monitor = "val_loss", factor = 0.1, patience = 500)))

print(history_nume)
## Trained on 234 samples, validated on 85 samples (batch_size=4,096, epochs=8,576)
## Final epoch (plot to see history):
## val_loss: 0.03396
##     loss: 0.02701
##       lr: 0.0000001
# plot(history_nume)

layer_weight <- keras::get_weights(model_nume)

pred_nume2 <- input_nume %>% 
  layer_dense(len_layer2) %>%
  layer_activation("selu") %>%
  layer_dense(len_layer1)

model_nume2 <- keras_model(input_nume, pred_nume2)
model_nume2 %>% set_weights(layer_weight)

ae_hidden_train <- predict(model_nume2, X_train2)

p1 <- ggpairs(data.frame(ae_hidden_train), mapping = aes(color = as.factor(y_train)),
              lower = list(continuous = wrap("points", alpha = 0.7)),
              diag = list(continuous = wrap("densityDiag", alpha = 0.5)),
              upper = list(continuous = wrap("cor", size = 3)), title = "Hidden layer of Training set")
# Correlation matrix plot
p2 <- ggcorr(data.frame(ae_hidden_train), label = TRUE, label_round = 2)
g2 <- ggplotGrob(p2)
colors <- g2$grobs[[6]]$children[[3]]$gp$fill
# Change background color to tiles in the upper triangular matrix of plots
idx <- 1
p <- ncol(ae_hidden_train)
for (k1 in 1:(p-1)) {
  for (k2 in (k1+1):p) {
    plt <- getPlot(p1,k1,k2) +
      theme(panel.background = element_rect(fill = colors[idx], color="white"),
            panel.grid.major = element_line(color = colors[idx]))
    p1 <- putPlot(p1,plt,k1,k2)
    idx <- idx+1
  }
}
p1

ae_hidden_test <- predict(model_nume2, X_test2)

p1 <- ggpairs(data.frame(ae_hidden_test), mapping = aes(color = as.factor(y_test)),
              lower = list(continuous = wrap("points", alpha = 0.7)),
              diag = list(continuous = wrap("densityDiag", alpha = 0.5)),
              upper = list(continuous = wrap("cor", size = 3)), title = "Hidden layer of Test set")
# Correlation matrix plot
p2 <- ggcorr(data.frame(ae_hidden_test), label = TRUE, label_round = 2)
g2 <- ggplotGrob(p2)
colors <- g2$grobs[[6]]$children[[3]]$gp$fill
# Change background color to tiles in the upper triangular matrix of plots
idx <- 1
p <- ncol(ae_hidden_test)
for (k1 in 1:(p-1)) {
  for (k2 in (k1+1):p) {
    plt <- getPlot(p1,k1,k2) +
      theme(panel.background = element_rect(fill = colors[idx], color="white"),
            panel.grid.major = element_line(color = colors[idx]))
    p1 <- putPlot(p1,plt,k1,k2)
    idx <- idx+1
  }
}
p1

ae_hidden_train_normal <- predict(model_nume2, X_train2[y_train==0,])
kde_train <- kde(ae_hidden_train_normal)

# ae_hidden_test[predict(kde_train, x = ae_hidden_test) < kde_train$cont["5%"],]

## Flag a test point as a predicted death (1) when its estimated density falls
## below the 5% contour level of the KDE fitted to the encoded survivors.
pred_test <- ifelse(predict(kde_train, x = ae_hidden_test) < kde_train$cont["5%"], 1, 0)
specificity_test <- sum(pred_test == 0 & y_test == 0)/(length(y_test)-sum(y_test))  # TN / (TN + FP)
sensitivity_test <- sum(pred_test == 1 & y_test == 1)/sum(y_test)                   # TP / (TP + FN)
BACC_test <- 1/2*(specificity_test + sensitivity_test)                              # balanced accuracy

results <- data.frame(network = paste(dim(X_train2)[2], len_layer2, len_layer1, len_layer2, dim(X_train2)[2], sep = "-"), 
                      # threshold = threshold,
                      test_sens = sensitivity_test, 
                      test_spec = specificity_test, 
                      test_bacc = BACC_test)
results
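
As a cross-check (a sketch; caret is already loaded and pred_test / y_test come from the chunk above), confusionMatrix() reports the same sensitivity, specificity, and balanced accuracy:

confusionMatrix(factor(pred_test, levels = c(0, 1)),
                factor(y_test, levels = c(0, 1)),
                positive = "1")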
  • Can the two Risk1Yr classes be distinguished in the hidden-layer plots above?

2.1.1. Kernel Density Estimation on the Three-Hidden-Layer Autoencoder (Averaged over 10 Splits)

sensitivity_test <- 0
specificity_test <- 0
BACC_test <- 0
for(seedind in 1:10) {
  X_train2 <- X_train_list[[seedind]]
  X_valid2 <- X_valid_list[[seedind]]
  X_test2 <- X_test_list[[seedind]]
  y_train <- y_train_list[[seedind]]
  y_valid <- y_valid_list[[seedind]]
  y_test <- y_test_list[[seedind]]

  len_layer2 <- floor(1 + sqrt(ncol(X_train2)))
  len_layer1 <- floor(1 + sqrt(len_layer2))
  
  library(keras)
  use_session_with_seed(2019)
  
  input_nume <- layer_input(c(dim(X_train2)[2]))
  
  pred_nume <- input_nume %>% 
    layer_dense(len_layer2) %>%
    layer_activation("selu") %>%
    layer_dense(len_layer1) %>%
    layer_activation("selu") %>%
    layer_dense(len_layer2) %>%
    layer_activation("selu") %>%
    layer_dense(dim(X_train2)[2]) %>%
    layer_activation("sigmoid")
  
  model_nume <- keras_model(input_nume, pred_nume)
  
  # summary(model_nume)
  
  model_nume %>% compile(
    optimizer = optimizer_adam(0.001),
    loss = "mean_squared_error"
    # loss = "mean_absolute_error"
    # loss = "logcosh"
  )
  
  history_nume <- model_nume %>% fit(X_train2[y_train==0,], X_train2[y_train==0,],
                                     batch_size = 4096,
                                     validation_data = list(X_valid2[y_valid==0,], X_valid2[y_valid==0,]),
                                     epochs = 10000,
                                     verbose = 0, 
                                     callbacks = list(callback_early_stopping(monitor = "val_loss", patience = 2000, restore_best_weights = TRUE),
                                                      callback_reduce_lr_on_plateau(monitor = "val_loss", factor = 0.1, patience = 500)))
  
  # print(history_nume)
  # plot(history_nume)
  
  layer_weight <- keras::get_weights(model_nume)
  
  pred_nume2 <- input_nume %>% 
    layer_dense(len_layer2) %>%
    layer_activation("selu") %>%
    layer_dense(len_layer1)
  
  model_nume2 <- keras_model(input_nume, pred_nume2)
  model_nume2 %>% set_weights(layer_weight)
  
  ae_hidden_train_normal <- predict(model_nume2, X_train2[y_train==0,])
  kde_train <- kde(ae_hidden_train_normal)
  ae_hidden_test <- predict(model_nume2, X_test2)
  
  pred_test <- ifelse(predict(kde_train, x = ae_hidden_test) < kde_train$cont["5%"], 1, 0)
  
  specificity_test <- specificity_test + sum(pred_test == 0 & y_test == 0)/(length(y_test)-sum(y_test))
  
  sensitivity_test <- sensitivity_test + sum(pred_test == 1 & y_test == 1)/sum(y_test)
  
  BACC_test <- BACC_test + 1/2*(sum(pred_test == 0 & y_test == 0)/(length(y_test)-sum(y_test)) + sum(pred_test == 1 & y_test == 1)/sum(y_test))
    
}
## Set session seed to 2019 (disabled GPU, CPU parallelism)
## (printed once per split; 10 splits in total)
results <- data.frame(network = paste(dim(X_train2)[2], len_layer2, len_layer1, len_layer2, dim(X_train2)[2], sep = "-"), 
                          # threshold = threshold,
                          test_sens = sensitivity_test / 10, 
                          test_spec = specificity_test / 10, 
                          test_bacc = BACC_test / 10)
results
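
Optionally, since knitr and kableExtra are already loaded, the averaged results can be rendered as a formatted table (a sketch):

kable(results, digits = 3) %>% kable_styling(full_width = FALSE)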

2.1.2. Kernel Density Estimation on the Three-Hidden-Layer Autoencoder with L2 Regularization (0.01)

sensitivity_test <- 0
specificity_test <- 0
BACC_test <- 0
for(seedind in 1:10) {
  X_train2 <- X_train_list[[seedind]]
  X_valid2 <- X_valid_list[[seedind]]
  X_test2 <- X_test_list[[seedind]]
  y_train <- y_train_list[[seedind]]
  y_valid <- y_valid_list[[seedind]]
  y_test <- y_test_list[[seedind]]

  len_layer2 <- floor(1 + sqrt(ncol(X_train2)))
  len_layer1 <- floor(1 + sqrt(len_layer2))
  
  library(keras)
  use_session_with_seed(2019)
  
  input_nume <- layer_input(c(dim(X_train2)[2]))
  
  pred_nume <- input_nume %>% 
    layer_dense(len_layer2) %>%
    layer_activation("selu") %>%
    layer_dense(len_layer1, kernel_regularizer = regularizer_l2(0.01)) %>%
    layer_activation("selu") %>%
    layer_dense(len_layer2) %>%
    layer_activation("selu") %>%
    layer_dense(dim(X_train2)[2]) %>%
    layer_activation("sigmoid")
  
  model_nume <- keras_model(input_nume, pred_nume)
  
  # summary(model_nume)
  
  model_nume %>% compile(
    optimizer = optimizer_adam(0.001),
    loss = "mean_squared_error"
    # loss = "mean_absolute_error"
    # loss = "logcosh"
  )
  
  history_nume <- model_nume %>% fit(X_train2[y_train==0,], X_train2[y_train==0,],
                                     batch_size = 4096,
                                     validation_data = list(X_valid2[y_valid==0,], X_valid2[y_valid==0,]),
                                     epochs = 10000,
                                     verbose = 0, 
                                     callbacks = list(callback_early_stopping(monitor = "val_loss", patience = 2000, restore_best_weights = TRUE),
                                                      callback_reduce_lr_on_plateau(monitor = "val_loss", factor = 0.1, patience = 500)))
  
  # print(history_nume)
  # plot(history_nume)
  
  layer_weight <- keras::get_weights(model_nume)
  
  pred_nume2 <- input_nume %>% 
    layer_dense(len_layer2) %>%
    layer_activation("selu") %>%
    layer_dense(len_layer1, kernel_regularizer = regularizer_l2(0.01))
  
  model_nume2 <- keras_model(input_nume, pred_nume2)
  model_nume2 %>% set_weights(layer_weight)
  
  ae_hidden_train_normal <- predict(model_nume2, X_train2[y_train==0,])
  kde_train <- kde(ae_hidden_train_normal)
  ae_hidden_test <- predict(model_nume2, X_test2)
  
  pred_test <- ifelse(predict(kde_train, x = ae_hidden_test) < kde_train$cont["5%"], 1, 0)
  
  specificity_test <- specificity_test + sum(pred_test == 0 & y_test == 0)/(length(y_test)-sum(y_test))
  
  sensitivity_test <- sensitivity_test + sum(pred_test == 1 & y_test == 1)/sum(y_test)
  
  BACC_test <- BACC_test + 1/2*(sum(pred_test == 0 & y_test == 0)/(length(y_test)-sum(y_test)) + sum(pred_test == 1 & y_test == 1)/sum(y_test))
    
}
## Set session seed to 2019 (disabled GPU, CPU parallelism)
## (printed once per split; 10 splits in total)
results <- data.frame(network = paste(dim(X_train2)[2], len_layer2, len_layer1, len_layer2, dim(X_train2)[2], sep = "-"), 
                          # threshold = threshold,
                          test_sens = sensitivity_test / 10, 
                          test_spec = specificity_test / 10, 
                          test_bacc = BACC_test / 10)
results

2.2. Kernel Density Estimation on a Deeper Autoencoder

sensitivity_test <- 0
specificity_test <- 0
BACC_test <- 0
for(seedind in 1:10) {
  X_train2 <- X_train_list[[seedind]]
  X_valid2 <- X_valid_list[[seedind]]
  X_test2 <- X_test_list[[seedind]]
  y_train <- y_train_list[[seedind]]
  y_valid <- y_valid_list[[seedind]]
  y_test <- y_test_list[[seedind]]

  len_layer3 <- floor(1 + ncol(X_train2)/2)   # 24 -> 13
  len_layer2 <- floor(1 + len_layer3/2)       # 13 -> 7
  len_layer1 <- floor(1 + len_layer2/2)       #  7 -> 4
  
  library(keras)
  use_session_with_seed(2019)
  
  input_nume <- layer_input(c(dim(X_train2)[2]))
  
  pred_nume <- input_nume %>% 
    layer_dense(len_layer3) %>%
    layer_activation("selu") %>%
    layer_dense(len_layer2) %>%
    layer_activation("selu") %>%
    layer_dense(len_layer1, kernel_regularizer = regularizer_l2(0.01)) %>%
    layer_activation("selu") %>%
    layer_dense(len_layer2) %>%
    layer_activation("selu") %>%
    layer_dense(len_layer3) %>%
    layer_activation("selu") %>%
    layer_dense(dim(X_train2)[2]) %>%
    layer_activation("sigmoid")
  
  model_nume <- keras_model(input_nume, pred_nume)
  
  # summary(model_nume)
  
  model_nume %>% compile(
    optimizer = optimizer_adam(0.001),
    loss = "mean_squared_error"
    # loss = "mean_absolute_error"
    # loss = "logcosh"
  )
  
  history_nume <- model_nume %>% fit(X_train2[y_train==0,], X_train2[y_train==0,],
                                     batch_size = 4096,
                                     validation_data = list(X_valid2[y_valid==0,], X_valid2[y_valid==0,]),
                                     epochs = 10000,
                                     verbose = 0, 
                                     callbacks = list(callback_early_stopping(monitor = "val_loss", patience = 2000, restore_best_weights = TRUE),
                                                      callback_reduce_lr_on_plateau(monitor = "val_loss", factor = 0.1, patience = 500)))
  
  # print(history_nume)
  # plot(history_nume)
  
  layer_weight <- keras::get_weights(model_nume)
  
  pred_nume2 <- input_nume %>% 
    layer_dense(len_layer3) %>%
    layer_activation("selu") %>%
    layer_dense(len_layer2) %>%
    layer_activation("selu") %>%
    layer_dense(len_layer1, kernel_regularizer = regularizer_l2(0.01))
  
  model_nume2 <- keras_model(input_nume, pred_nume2)
  model_nume2 %>% set_weights(layer_weight)
  
  ae_hidden_train_normal <- predict(model_nume2, X_train2[y_train==0,])
  kde_train <- kde(ae_hidden_train_normal)
  ae_hidden_test <- predict(model_nume2, X_test2)
  
  pred_test <- ifelse(predict(kde_train, x = ae_hidden_test) < kde_train$cont["5%"], 1, 0)
  
  specificity_test <- specificity_test + sum(pred_test == 0 & y_test == 0)/(length(y_test)-sum(y_test))
  
  sensitivity_test <- sensitivity_test + sum(pred_test == 1 & y_test == 1)/sum(y_test)
  
  BACC_test <- BACC_test + 1/2*(sum(pred_test == 0 & y_test == 0)/(length(y_test)-sum(y_test)) + sum(pred_test == 1 & y_test == 1)/sum(y_test))
    
}
## Set session seed to 2019 (disabled GPU, CPU parallelism)
## (printed once per split; 10 splits in total)
results <- data.frame(network = paste(dim(X_train2)[2], len_layer3, len_layer2, len_layer1, len_layer2, len_layer3, dim(X_train2)[2], sep = "-"), 
                          # threshold = threshold,
                          test_sens = sensitivity_test / 10, 
                          test_spec = specificity_test / 10, 
                          test_bacc = BACC_test / 10)
results
ae_hidden_train <- predict(model_nume2, X_train2)

p1 <- ggpairs(data.frame(ae_hidden_train), mapping = aes(color = as.factor(y_train)),
              lower = list(continuous = wrap("points", alpha = 0.7)),
              diag = list(continuous = wrap("densityDiag", alpha = 0.5)),
              upper = list(continuous = wrap("cor", size = 3)), title = "Hidden layer of Training set")
# Correlation matrix plot
p2 <- ggcorr(data.frame(ae_hidden_train), label = TRUE, label_round = 2)
g2 <- ggplotGrob(p2)
colors <- g2$grobs[[6]]$children[[3]]$gp$fill
# Change background color to tiles in the upper triangular matrix of plots
idx <- 1
p <- ncol(ae_hidden_train)
for (k1 in 1:(p-1)) {
  for (k2 in (k1+1):p) {
    plt <- getPlot(p1,k1,k2) +
      theme(panel.background = element_rect(fill = colors[idx], color="white"),
            panel.grid.major = element_line(color = colors[idx]))
    p1 <- putPlot(p1,plt,k1,k2)
    idx <- idx+1
  }
}
p1

ae_hidden_test <- predict(model_nume2, X_test2)

p1 <- ggpairs(data.frame(ae_hidden_test), mapping = aes(color = as.factor(y_test)),
              lower = list(continuous = wrap("points", alpha = 0.7)),
              diag = list(continuous = wrap("densityDiag", alpha = 0.5)),
              upper = list(continuous = wrap("cor", size = 3)), title = "Hidden layer of Test set")
# Correlation matrix plot
p2 <- ggcorr(data.frame(ae_hidden_test), label = TRUE, label_round = 2)
g2 <- ggplotGrob(p2)
colors <- g2$grobs[[6]]$children[[3]]$gp$fill
# Change background color to tiles in the upper triangular matrix of plots
idx <- 1
p <- ncol(ae_hidden_test)
for (k1 in 1:(p-1)) {
  for (k2 in (k1+1):p) {
    plt <- getPlot(p1,k1,k2) +
      theme(panel.background = element_rect(fill = colors[idx], color="white"),
            panel.grid.major = element_line(color = colors[idx]))
    p1 <- putPlot(p1,plt,k1,k2)
    idx <- idx+1
  }
}
p1

3. Gaussian Mixture Model on Deep Autoencoder

sensitivity_test <- 0
specificity_test <- 0
BACC_test <- 0
for(seedind in 1:10) {
  X_train2 <- X_train_list[[seedind]]
  X_valid2 <- X_valid_list[[seedind]]
  X_test2 <- X_test_list[[seedind]]
  y_train <- y_train_list[[seedind]]
  y_valid <- y_valid_list[[seedind]]
  y_test <- y_test_list[[seedind]]

  len_layer3 <- floor(1 + ncol(X_train2)/2)   # 24 -> 13
  len_layer2 <- floor(1 + len_layer3/2)       # 13 -> 7
  len_layer1 <- floor(1 + sqrt(len_layer2))   #  7 -> 3
  
  library(keras)
  use_session_with_seed(2019)
  
  input_nume <- layer_input(c(dim(X_train2)[2]))
  
  pred_nume <- input_nume %>% 
    layer_dense(len_layer3) %>%
    layer_activation("selu") %>%
    layer_dense(len_layer2) %>%
    layer_activation("selu") %>%
    layer_dense(len_layer1, kernel_regularizer = regularizer_l2(0.01)) %>%
    layer_activation("selu") %>%
    layer_dense(len_layer2) %>%
    layer_activation("selu") %>%
    layer_dense(len_layer3) %>%
    layer_activation("selu") %>%
    layer_dense(dim(X_train2)[2]) %>%
    layer_activation("sigmoid")
  
  model_nume <- keras_model(input_nume, pred_nume)
  
  # summary(model_nume)
  
  model_nume %>% compile(
    optimizer = optimizer_adam(0.001),
    loss = "mean_squared_error"
    # loss = "mean_absolute_error"
    # loss = "logcosh"
  )
  
  history_nume <- model_nume %>% fit(X_train2[y_train == 0,], X_train2[y_train == 0,],
                                     batch_size = 4096,
                                     validation_data = list(X_valid2[y_valid == 0,], X_valid2[y_valid == 0,]),
                                     epochs = 10000,
                                     verbose = 0, 
                                     callbacks = list(callback_early_stopping(monitor = "val_loss", patience = 2000, restore_best_weights = TRUE),
                                                      callback_reduce_lr_on_plateau(monitor = "val_loss", factor = 0.1, patience = 500)))
  
  # print(history_nume)
  # plot(history_nume)
  
  layer_weight <- keras::get_weights(model_nume)
  
  pred_nume2 <- input_nume %>% 
    layer_dense(len_layer3) %>%
    layer_activation("selu") %>%
    layer_dense(len_layer2) %>%
    layer_activation("selu") %>%
    layer_dense(len_layer1)
  
  model_nume2 <- keras_model(input_nume, pred_nume2)
  model_nume2 %>% set_weights(layer_weight)
    
  ## Fit a single multivariate Gaussian (G = 1) to the encoded survivors.
  ae_hidden_train_normal <- predict(model_nume2, X_train2[y_train == 0,])
  m_gmm <- Mclust(ae_hidden_train_normal, G = 1)
  # plot(m_gmm, what = "classification")
  
  gmm_mean <- c(m_gmm$parameters$mean)
  gmm_sigma <- m_gmm$parameters$variance$Sigma
  
  ae_hidden_test <- predict(model_nume2, X_test2)
  
  ## Squared Mahalanobis distance of each encoded test point from the fitted Gaussian,
  ## converted to an upper-tail chi-square probability (df = bottleneck dimension).
  ae_prob <- apply(ae_hidden_test, 1, function(x) {
    q_gmm <- t(x - gmm_mean) %*% solve(gmm_sigma) %*% (x - gmm_mean)
    pchisq(q_gmm, df = length(gmm_mean), lower.tail = FALSE)
  })
  
  ## Flag points with probability below 0.05 as predicted deaths and accumulate the metrics.
  pred_test <- ifelse(ae_prob < 0.05, 1, 0)
  specificity_test <- specificity_test + sum(pred_test == 0 & y_test == 0)/(length(y_test)-sum(y_test))
  sensitivity_test <- sensitivity_test + sum(pred_test == 1 & y_test == 1)/sum(y_test)
  BACC_test <- BACC_test + 1/2*(sum(pred_test == 0 & y_test == 0)/(length(y_test)-sum(y_test)) + sum(pred_test == 1 & y_test == 1)/sum(y_test))
}
## Set session seed to 2019 (disabled GPU, CPU parallelism)
## (printed once per split; 10 splits in total)
results <- data.frame(network = paste(dim(X_train2)[2], len_layer3, len_layer2, len_layer1, len_layer2, len_layer3, dim(X_train2)[2], sep = "-"), 
                          # threshold = threshold,
                          test_sens = sensitivity_test / 10, 
                          test_spec = specificity_test / 10, 
                          test_bacc = BACC_test / 10)
results
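
Because q_gmm above is the squared Mahalanobis distance, the same p-values can be reproduced with base R's mahalanobis() (a sketch using the objects left over from the last split of the loop):

q_test <- mahalanobis(ae_hidden_test, center = gmm_mean, cov = gmm_sigma)
all.equal(unname(pchisq(q_test, df = length(gmm_mean), lower.tail = FALSE)), unname(ae_prob))
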
ae_hidden_train <- predict(model_nume2, X_train2)

p1 <- ggpairs(data.frame(ae_hidden_train), mapping = aes(color = as.factor(y_train)),
              lower = list(continuous = wrap("points", alpha = 0.7)),
              diag = list(continuous = wrap("densityDiag", alpha = 0.5)),
              upper = list(continuous = wrap("cor", size = 3)), title = "Hidden layer of Training set")
# Correlation matrix plot
p2 <- ggcorr(data.frame(ae_hidden_train), label = TRUE, label_round = 2)
g2 <- ggplotGrob(p2)
colors <- g2$grobs[[6]]$children[[3]]$gp$fill
# Change background color to tiles in the upper triangular matrix of plots
idx <- 1
p <- ncol(ae_hidden_train)
for (k1 in 1:(p-1)) {
  for (k2 in (k1+1):p) {
    plt <- getPlot(p1,k1,k2) +
      theme(panel.background = element_rect(fill = colors[idx], color="white"),
            panel.grid.major = element_line(color = colors[idx]))
    p1 <- putPlot(p1,plt,k1,k2)
    idx <- idx+1
  }
}
p1

ae_hidden_test <- predict(model_nume2, X_test2)

p1 <- ggpairs(data.frame(ae_hidden_test), mapping = aes(color = as.factor(y_test)),
              lower = list(continuous = wrap("points", alpha = 0.7)),
              diag = list(continuous = wrap("densityDiag", alpha = 0.5)),
              upper = list(continuous = wrap("cor", size = 3)), title = "Hidden layer of Test set")
# Correlation matrix plot
p2 <- ggcorr(data.frame(ae_hidden_test), label = TRUE, label_round = 2)
g2 <- ggplotGrob(p2)
colors <- g2$grobs[[6]]$children[[3]]$gp$fill
# Change background color to tiles in the upper triangular matrix of plots
idx <- 1
p <- ncol(ae_hidden_test)
for (k1 in 1:(p-1)) {
  for (k2 in (k1+1):p) {
    plt <- getPlot(p1,k1,k2) +
      theme(panel.background = element_rect(fill = colors[idx], color="white"),
            panel.grid.major = element_line(color = colors[idx]))
    p1 <- putPlot(p1,plt,k1,k2)
    idx <- idx+1
  }
}
p1