rm(list = ls())
#install.packages("keras")
library(keras)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0     ✔ purrr   0.3.5
## ✔ tibble  3.2.1     ✔ dplyr   1.1.1
## ✔ tidyr   1.2.1     ✔ stringr 1.4.1
## ✔ readr   2.1.3     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(splitstackshape)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
#install.packages("caret")
#?install_keras() #Install TensorFlow and Keras, including all Python dependencies
#install_keras(tensorflow = "gpu")
packageVersion('keras') #[1] ‘2.11.1’
## [1] '2.11.1'
packageVersion('tensorflow') #[1] ‘2.11.0’
## [1] '2.11.0'
############################### Input data
dir_path <- "/Users/xut2/Desktop/keras/"
dir_path_name <- dir(dir_path, pattern = "\\.csv$", full.names = T, recursive = T)  # pattern is a regex, so "\\.csv$" rather than the glob "*.csv"
#dir_path_name
red <- read.csv(grep("winequality-red.csv", dir_path_name, value = T),
                      check.names = F, header = T,stringsAsFactors = F, sep = ";")
#dim(red) #[1] 1599   12
#head(red)
red <- red %>% mutate(good = ifelse(quality >= 6, 1, 0))  # keras needs a numeric 0/1 target, not a factor
dim(red); table(red$good) # 0   1 744 855
## [1] 1599   13
## 
##   0   1 
## 744 855
head(red)
##   fixed acidity volatile acidity citric acid residual sugar chlorides
## 1           7.4             0.70        0.00            1.9     0.076
## 2           7.8             0.88        0.00            2.6     0.098
## 3           7.8             0.76        0.04            2.3     0.092
## 4          11.2             0.28        0.56            1.9     0.075
## 5           7.4             0.70        0.00            1.9     0.076
## 6           7.4             0.66        0.00            1.8     0.075
##   free sulfur dioxide total sulfur dioxide density   pH sulphates alcohol
## 1                  11                   34  0.9978 3.51      0.56     9.4
## 2                  25                   67  0.9968 3.20      0.68     9.8
## 3                  15                   54  0.9970 3.26      0.65     9.8
## 4                  17                   60  0.9980 3.16      0.58     9.8
## 5                  11                   34  0.9978 3.51      0.56     9.4
## 6                  13                   40  0.9978 3.51      0.56     9.4
##   quality good
## 1       5    0
## 2       5    0
## 3       5    0
## 4       6    1
## 5       5    0
## 6       5    0
colnames(red)[ncol(red)] <- "num"  # rename the binary target from "good" to "num"
red$quality <- NULL                # drop the original quality score so it cannot leak into the features
red$num <- as.numeric(red$num)
set.seed(2023)
# Take ~30% of rows, stratified by "num", as the test set.
# Caveat: stratified() returns a data.table, so row.names() below is just "1".."N",
# not the original row names; in effect the first ~30% of rows end up in the test set
# (a corrected sketch follows the overlap check below).
data_test_list <- row.names(stratified(red, "num", .3))
testdata <- red[row.names(red) %in% data_test_list,]
traindata <- red[!row.names(red) %in% data_test_list,]
testdata <- unique(testdata)    # drop duplicate rows
traindata <- unique(traindata)  # drop duplicate rows
str(traindata);str(testdata)
## 'data.frame':    940 obs. of  12 variables:
##  $ fixed acidity       : num  9.4 10.6 9.4 10.6 10.6 10.6 10.2 10.2 11.6 9.3 ...
##  $ volatile acidity    : num  0.685 0.28 0.3 0.36 0.36 0.44 0.67 0.645 0.32 0.39 ...
##  $ citric acid         : num  0.11 0.39 0.56 0.59 0.6 0.68 0.39 0.36 0.55 0.4 ...
##  $ residual sugar      : num  2.7 15.5 2.8 2.2 2.2 4.1 1.9 1.8 2.8 2.6 ...
##  $ chlorides           : num  0.077 0.069 0.08 0.152 0.152 0.114 0.054 0.053 0.081 0.073 ...
##  $ free sulfur dioxide : num  6 6 6 6 7 6 6 5 35 10 ...
##  $ total sulfur dioxide: num  31 23 17 18 18 24 17 14 67 26 ...
##  $ density             : num  0.998 1.003 0.996 0.999 0.999 ...
##  $ pH                  : num  3.19 3.12 3.15 3.04 3.04 3.06 3.17 3.17 3.32 3.34 ...
##  $ sulphates           : num  0.7 0.66 0.92 1.05 1.06 0.66 0.47 0.42 0.92 0.75 ...
##  $ alcohol             : num  10.1 9.2 11.7 9.4 9.4 13.4 10 10 10.8 10.2 ...
##  $ num                 : num  1 0 1 0 0 1 0 1 1 1 ...
## 'data.frame':    419 obs. of  12 variables:
##  $ fixed acidity       : num  7.4 7.8 7.8 11.2 7.4 7.9 7.3 7.8 7.5 6.7 ...
##  $ volatile acidity    : num  0.7 0.88 0.76 0.28 0.66 0.6 0.65 0.58 0.5 0.58 ...
##  $ citric acid         : num  0 0 0.04 0.56 0 0.06 0 0.02 0.36 0.08 ...
##  $ residual sugar      : num  1.9 2.6 2.3 1.9 1.8 1.6 1.2 2 6.1 1.8 ...
##  $ chlorides           : num  0.076 0.098 0.092 0.075 0.075 0.069 0.065 0.073 0.071 0.097 ...
##  $ free sulfur dioxide : num  11 25 15 17 13 15 15 9 17 15 ...
##  $ total sulfur dioxide: num  34 67 54 60 40 59 21 18 102 65 ...
##  $ density             : num  0.998 0.997 0.997 0.998 0.998 ...
##  $ pH                  : num  3.51 3.2 3.26 3.16 3.51 3.3 3.39 3.36 3.35 3.28 ...
##  $ sulphates           : num  0.56 0.68 0.65 0.58 0.56 0.46 0.47 0.57 0.8 0.54 ...
##  $ alcohol             : num  9.4 9.8 9.8 9.8 9.4 9.4 10 9.5 10.5 9.2 ...
##  $ num                 : num  0 0 0 1 0 0 1 1 0 0 ...
#head(traindata)
#unique(traindata$num)
#table(traindata$num); table(testdata$num)
#dim(traindata); dim(testdata) #[1] 940  12  [1] 419  12
sum(row.names(traindata) %in% row.names(testdata))
## [1] 0
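# Aside (not used for the results below): because of the row-name caveat noted above,
# the split is not actually stratified. A sketch of a stratified split, assuming
# keep.rownames = TRUE keeps the original row names as an "rn" column (as documented
# for splitstackshape::stratified); new objects only, so the pipeline above is untouched.
strat_sample    <- stratified(red, "num", .3, keep.rownames = TRUE)
test_ids_strat  <- strat_sample$rn
testdata_strat  <- red[row.names(red) %in% test_ids_strat, ]
traindata_strat <- red[!row.names(red) %in% test_ids_strat, ]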
#################################keras needs a numeric target. With the single sigmoid
#output unit used below, the 0/1 vector in "num" is enough; to_categorical() is only
#needed when using a softmax output with one dummy column per category.
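# Sketch only (not used by the model below): one-hot encoding for a two-unit softmax setup,
# which would pair with loss = 'categorical_crossentropy'.
y_onehot <- to_categorical(traindata$num, num_classes = 2)  # n x 2 matrix, one column per class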
head(traindata)
##     fixed acidity volatile acidity citric acid residual sugar chlorides
## 480           9.4            0.685        0.11            2.7     0.077
## 481          10.6            0.280        0.39           15.5     0.069
## 482           9.4            0.300        0.56            2.8     0.080
## 483          10.6            0.360        0.59            2.2     0.152
## 484          10.6            0.360        0.60            2.2     0.152
## 485          10.6            0.440        0.68            4.1     0.114
##     free sulfur dioxide total sulfur dioxide density   pH sulphates alcohol num
## 480                   6                   31  0.9984 3.19      0.70    10.1   1
## 481                   6                   23  1.0026 3.12      0.66     9.2   0
## 482                   6                   17  0.9964 3.15      0.92    11.7   1
## 483                   6                   18  0.9986 3.04      1.05     9.4   0
## 484                   7                   18  0.9986 3.04      1.06     9.4   0
## 485                   6                   24  0.9970 3.06      0.66    13.4   1
traindata[, -ncol(traindata)] <- scale(traindata[, -ncol(traindata)])
testdata[, -ncol(testdata)] <- scale(testdata[, -ncol(testdata)])
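# Note: each set is standardised with its own means/SDs above. A leakage-free alternative
# (sketch only, left commented out so the results below still match the code above) would
# replace the two lines above and reuse the training-set statistics that scale() attaches
# as attributes:
#   train_scaled <- scale(traindata[, -ncol(traindata)])
#   traindata[, -ncol(traindata)] <- train_scaled
#   testdata[, -ncol(testdata)]  <- scale(testdata[, -ncol(testdata)],
#                                         center = attr(train_scaled, "scaled:center"),
#                                         scale  = attr(train_scaled, "scaled:scale"))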
################################Define the neural network using the keras syntax.
#?keras_model_sequential()  #Keras model composed of a linear stack of layers
#?keras_model()
#Three hidden layer_dense() layers are stacked with piping, each followed by a
#layer_dropout() layer for regularization to reduce overfitting, and a single
#sigmoid output unit finishes the stack.
model <- keras_model_sequential() 
model %>% 
  layer_dense(units = 64, activation = 'relu', input_shape = ncol(traindata)-1) %>% 
  layer_dropout(rate = 0.4) %>% 
  layer_dense(units = 64, activation = 'relu') %>%
  layer_dropout(rate = 0.3) %>%
  layer_dense(units = 64, activation = 'relu') %>% 
  layer_dropout(rate = 0.2) %>% 
  layer_dense(units = 1, activation = 'sigmoid')
################################Before fitting the model, we compile it, specifying the
#loss function, the optimizer, and the metric used to evaluate performance.
model %>% compile(
  loss = 'binary_crossentropy',
  optimizer = "adam",
  metrics = c('AUC'))
summary(model)
## Model: "sequential"
## ________________________________________________________________________________
##  Layer (type)                       Output Shape                    Param #     
## ================================================================================
##  dense_3 (Dense)                    (None, 64)                      768         
##  dropout_2 (Dropout)                (None, 64)                      0           
##  dense_2 (Dense)                    (None, 64)                      4160        
##  dropout_1 (Dropout)                (None, 64)                      0           
##  dense_1 (Dense)                    (None, 64)                      4160        
##  dropout (Dropout)                  (None, 64)                      0           
##  dense (Dense)                      (None, 1)                       65          
## ================================================================================
## Total params: 9,153
## Trainable params: 9,153
## Non-trainable params: 0
## ________________________________________________________________________________
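# Parameter-count check: with 11 input features, the first dense layer has
# (11 + 1 bias) * 64 = 768 weights, each following 64-unit hidden layer has
# (64 + 1) * 64 = 4160, and the sigmoid output has 64 + 1 = 65;
# 768 + 4160 + 4160 + 65 = 9153, matching "Total params" above.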
################################Call fit() to estimate the network parameters, holding out
#20% of the training rows for validation.
history <- model %>% fit(
  x = as.matrix(traindata[, -ncol(traindata)]),
  y = as.matrix(traindata[, ncol(traindata)]),
  epochs = 20, 
  batch_size = 32,
  validation_split = 0.2,
  verbose=0)
plot(history)
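# Besides plot(history), the per-epoch values are available directly, e.g.
# history$metrics holds the loss, auc, val_loss and val_auc traces.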

#############################Evaluate the model’s performance on the training and test data:
model %>% evaluate(as.matrix(traindata[, -ncol(traindata)]), 
                           as.matrix(traindata[, ncol(traindata)]), batch_size=32)
##      loss       auc 
## 0.4678146 0.8583627
model %>% evaluate(as.matrix(testdata[, -ncol(testdata)]), 
                           as.matrix(testdata[, ncol(testdata)]), batch_size=32)
##      loss       auc 
## 0.5687892 0.7986152
############################
# Predict test-set probabilities (the sigmoid output is already a probability,
# so no extra argument is needed)
pred_prob <- predict(model, as.matrix(testdata[, -ncol(testdata)]))
class(pred_prob)
## [1] "matrix" "array"
# Compute ROC curve
roc_curve <- roc(testdata$num, pred_prob[,1], plot = TRUE, print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Extract the AUC from the ROC object and print it
auc <- auc(roc_curve)
print(auc)
## Area under the curve: 0.799
# Compute Youden's J statistic for each cutoff
youden_j <- roc_curve$sensitivities + roc_curve$specificities - 1
# Find cutoff that maximizes Youden's J statistic
optimal_cutoff <- roc_curve$thresholds[which.max(youden_j)]
# Print optimal cutoff
print(paste("Optimal cutoff:", optimal_cutoff))
## [1] "Optimal cutoff: 0.476998791098595"
# Predict classes based on optimal cutoff
pred_class <- ifelse(pred_prob >= optimal_cutoff, 1, 0)
head(pred_class)
##      [,1]
## [1,]    0
## [2,]    0
## [3,]    0
## [4,]    1
## [5,]    0
## [6,]    0
# Compute the confusion matrix (rows = true class, columns = predicted class)
conf_mat <- table(testdata$num, pred_class)
# Print confusion matrix
print(conf_mat)
##    pred_class
##       0   1
##   0 157  69
##   1  44 149
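# A few summary rates can be read straight off the confusion matrix; caret::confusionMatrix()
# offers a fuller summary. The object names below are illustrative additions.
accuracy    <- sum(diag(conf_mat)) / sum(conf_mat)        # (157 + 149) / 419 ~ 0.73
sensitivity <- conf_mat["1", "1"] / sum(conf_mat["1", ])  # 149 / 193 ~ 0.77
specificity <- conf_mat["0", "0"] / sum(conf_mat["0", ])  # 157 / 226 ~ 0.69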
# Print first 10 predicted probabilities
print(head(pred_prob, n = 10))
##            [,1]
##  [1,] 0.3196415
##  [2,] 0.2729245
##  [3,] 0.3524578
##  [4,] 0.6058478
##  [5,] 0.3603406
##  [6,] 0.3167345
##  [7,] 0.4792583
##  [8,] 0.4211475
##  [9,] 0.5440419
## [10,] 0.3048264
library(ggplot2)  # already attached via tidyverse; harmless to call again
# Recompute the ROC curve (same as above, with the probabilities passed as a plain numeric vector)
roc_curve <- roc(testdata$num, as.numeric(pred_prob), plot = TRUE, print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

cutoff_nb <- coords(roc_curve, "best", ret = "threshold")
cutoff_nb 
##   threshold
## 1 0.4769988
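# coords(roc_curve, "best") uses the Youden criterion by default, so it reproduces
# the optimal cutoff of ~0.477 computed manually above.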
# Create data frame for plotting (convert specificity to a false positive rate)
roc_df <- data.frame(fpr = 1 - roc_curve$specificities, tpr = roc_curve$sensitivities)
# Reuse the AUC computed above
auc_val <- auc
# Plot ROC curve and AUC
ggplot(roc_df, aes(x = fpr, y = tpr)) +
  geom_line() +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
  ggtitle(paste0("ROC Curve (AUC = ", round(auc_val, 2), ")")) +
  xlab("False Positive Rate") +
  ylab("True Positive Rate") +
  theme_bw()

k_clear_session() # free the Keras/TensorFlow session memory
#REF https://rpubs.com/liam/DLWR
#https://blog.csdn.net/wyzwyzwyzo/article/details/107242986
#https://oliviergimenez.github.io/bin-image-classif/