Load data

# load the Pima Indians Diabetes dataset from the mlbench package
library(mlbench)
data(PimaIndiansDiabetes) 
# give the dataset a shorter name
diabetes <- PimaIndiansDiabetes

data.set <- diabetes
summary(data.set)
##     pregnant         glucose         pressure         triceps     
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     insulin           mass          pedigree           age        diabetes 
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780   Min.   :21.00   neg:500  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437   1st Qu.:24.00   pos:268  
##  Median : 30.5   Median :32.00   Median :0.3725   Median :29.00            
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719   Mean   :33.24            
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262   3rd Qu.:41.00            
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200   Max.   :81.00
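
The summary already flags a data-quality quirk: glucose, pressure, triceps, insulin, and mass all have minimums of 0, which is physiologically implausible and usually stands in for a missing measurement. A quick optional check (a sketch only; the tutorial proceeds with the raw values):

# count zeros in the columns where 0 most likely means "missing"
zero_cols <- c("glucose", "pressure", "triceps", "insulin", "mass")
colSums(diabetes[, zero_cols] == 0)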

Process the data and target variable

# recode the factor target (neg/pos) from 1/2 to 0/1
data.set$diabetes <- as.numeric(data.set$diabetes) - 1
head(data.set$diabetes)
## [1] 1 0 1 0 1 0
head(data.set)
##   pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1        6     148       72      35       0 33.6    0.627  50        1
## 2        1      85       66      29       0 26.6    0.351  31        0
## 3        8     183       64       0       0 23.3    0.672  32        1
## 4        1      89       66      23      94 28.1    0.167  21        0
## 5        0     137       40      35     168 43.1    2.288  33        1
## 6        5     116       74       0       0 25.6    0.201  30        0
str(data.set)
## 'data.frame':    768 obs. of  9 variables:
##  $ pregnant: num  6 1 8 1 0 5 3 10 2 8 ...
##  $ glucose : num  148 85 183 89 137 116 78 115 197 125 ...
##  $ pressure: num  72 66 64 66 40 74 50 0 70 96 ...
##  $ triceps : num  35 29 0 23 35 0 32 0 45 0 ...
##  $ insulin : num  0 0 0 94 168 0 88 0 543 0 ...
##  $ mass    : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ pedigree: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ age     : num  50 31 32 21 33 30 26 29 53 54 ...
##  $ diabetes: num  1 0 1 0 1 0 1 0 1 1 ...
  • transform the data frame into a matrix
# Cast dataframe as a matrix
data.set <- as.matrix(data.set)

# Remove column names
dimnames(data.set) <- NULL
head(data.set)
##      [,1] [,2] [,3] [,4] [,5] [,6]  [,7] [,8] [,9]
## [1,]    6  148   72   35    0 33.6 0.627   50    1
## [2,]    1   85   66   29    0 26.6 0.351   31    0
## [3,]    8  183   64    0    0 23.3 0.672   32    1
## [4,]    1   89   66   23   94 28.1 0.167   21    0
## [5,]    0  137   40   35  168 43.1 2.288   33    1
## [6,]    5  116   74    0    0 25.6 0.201   30    0

Split data into training and test datasets

  • creating x_train, y_train, x_test, and y_test
# Split for train and test data
set.seed(100)
indx <- sample(2,
               nrow(data.set),
               replace = TRUE,
               prob = c(0.8, 0.2)) # Makes index with values 1 and 2
# Select only the feature variables
# Take rows with index = 1
x_train <- data.set[indx == 1, 1:8]
x_test <- data.set[indx == 2, 1:8]
# Feature scaling
# Note: train and test are scaled independently here for simplicity;
# reusing the training statistics on the test set avoids leakage (see the sketch below)
x_train <- scale(x_train)
x_test <- scale(x_test)
y_test_actual <- data.set[indx == 2, 9]
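
scale() stores the training means and standard deviations as attributes, so a leakage-free alternative (a sketch only, not what the results below use) is to apply them to the raw test features:

# reuse the training center/scale on the raw test features
train_center <- attr(x_train, "scaled:center")
train_scale <- attr(x_train, "scaled:scale")
x_test_alt <- scale(data.set[indx == 2, 1:8],
                    center = train_center,
                    scale = train_scale)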
  • transform the target into one-hot encoded format
# Use the same indices so the targets line up with the feature splits
library(keras)
y_train <- to_categorical(data.set[indx == 1, 9])
## Loaded Tensorflow version 2.8.0
y_test <- to_categorical(data.set[indx == 2, 9])
head(y_train)
##      [,1] [,2]
## [1,]    0    1
## [2,]    1    0
## [3,]    0    1
## [4,]    1    0
## [5,]    0    1
## [6,]    1    0
head(data.set[indx == 1, 9], 20)
##  [1] 1 0 1 0 1 0 0 1 1 0 0 1 1 1 1 1 0 1 0 0
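
to_categorical() just expands the 0/1 labels into two indicator columns, which can be reproduced by hand:

# equivalent by hand: column 1 flags class 0, column 2 flags class 1
y01 <- data.set[indx == 1, 9]
head(cbind(1 - y01, y01))  # matches head(y_train) above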
  • dimensions of the four resulting data sets
dim(x_train)
## [1] 609   8
dim(y_train)
## [1] 609   2
dim(x_test)
## [1] 159   8
dim(y_test)
## [1] 159   2
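
The two splits add up to 609 + 159 = 768 rows, about a 79/21 split, in line with prob = c(0.8, 0.2) above.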

Creating the neural network model

Construction of the model

  • the output layer contains 2 units, one per one-hot encoded class
# Creating the model
model <- keras_model_sequential()

model %>% 
  layer_dense(name = "DeepLayer1",
              units = 10,
              activation = "relu",
              input_shape = c(8)) %>% 
  # the input layer takes 8 features
  layer_dense(name = "DeepLayer2",
              units = 10,
              activation = "relu") %>% 
  
  layer_dense(name = "OutputLayer",
              units = 2,
              activation = "softmax")
  # the output layer covers the 2 one-hot encoded categories
summary(model)
## Model: "sequential"
## ________________________________________________________________________________
##  Layer (type)                       Output Shape                    Param #     
## ================================================================================
##  DeepLayer1 (Dense)                 (None, 10)                      90          
##  DeepLayer2 (Dense)                 (None, 10)                      110         
##  OutputLayer (Dense)                (None, 2)                       22          
## ================================================================================
## Total params: 222
## Trainable params: 222
## Non-trainable params: 0
## ________________________________________________________________________________
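
The parameter counts follow from (inputs × units) + units bias terms per dense layer, which is easy to verify:

# parameters per dense layer: inputs * units + units (biases)
c(DeepLayer1 = 8 * 10 + 10,    # 90
  DeepLayer2 = 10 * 10 + 10,   # 110
  OutputLayer = 10 * 2 + 2)    # 22 -> total 222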

Compiling the model

# Compiling the model
model %>% compile(loss = "categorical_crossentropy",
                  optimizer = "adam",
                  metrics = c("accuracy"))

Fitting the model and plotting the training history

history <- model %>% 
  fit(x_train,
      y_train,
      
      # number of training epochs
      epochs = 60,
      
      # mini-batch size
      batch_size = 64,
      validation_split = 0.15,
      verbose = 2)
plot(history)
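
The history object also keeps the per-epoch numbers as plain vectors, so the final epoch can be inspected directly (a sketch, assuming the loss/accuracy metrics compiled above):

# training/validation metrics from the last epoch
tail(as.data.frame(history$metrics), 1)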

Evaluation

Output loss and accuracy

Using the x_test and y_test data sets to evaluate the fitted model directly:

model %>% 
  evaluate(x_test,
           y_test)
##      loss  accuracy 
## 0.4579953 0.7610063

Output the predicted classes and confusion matrix

pred <- model %>% 
  predict(x_test) %>%  # per-class probabilities
  k_argmax() %>%       # 0-based index of the most probable class
  k_get_value()        # bring the tensor back as an R vector
head(pred)
## [1] 0 1 0 1 0 0
table(Predicted = pred,
      Actual = y_test_actual)
##          Actual
## Predicted  0  1
##         0 89 23
##         1 15 32
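
The confusion matrix is consistent with the accuracy reported by evaluate(): (89 + 32) / 159 ≈ 0.761.

# accuracy recomputed from the confusion matrix diagonal
cm <- table(Predicted = pred, Actual = y_test_actual)
sum(diag(cm)) / sum(cm)  # ~0.761, matching evaluate() above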

Output the predicted probabilities

prob <- model %>% 
  predict(x_test) %>%  # softmax probabilities, one column per class
  k_get_value()
head(prob)
##           [,1]       [,2]
## [1,] 0.9557184 0.04428164
## [2,] 0.1685439 0.83145601
## [3,] 0.9652161 0.03478396
## [4,] 0.3754042 0.62459576
## [5,] 0.7928934 0.20710661
## [6,] 0.7832032 0.21679683
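
Since the output layer is softmax, each row of prob sums to 1, and pred is simply the index of the larger column (0-based), which we can confirm:

# rows sum to 1; the larger column (0-based) reproduces pred
head(rowSums(prob))      # all ~1
head(max.col(prob) - 1)  # matches head(pred) above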

Comparison between prob, pred, and y_test_actual

comparison <- cbind(prob,
                    pred,
                    y_test_actual)
head(comparison)
##                           pred y_test_actual
## [1,] 0.9557184 0.04428164    0             1
## [2,] 0.1685439 0.83145601    1             1
## [3,] 0.9652161 0.03478396    0             0
## [4,] 0.3754042 0.62459576    1             1
## [5,] 0.7928934 0.20710661    0             0
## [6,] 0.7832032 0.21679683    0             0
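
For readability, the two unnamed probability columns can be labeled; given how to_categorical() ordered the classes, column 1 is the probability of class 0 (neg) and column 2 of class 1 (pos):

# label the probability columns (class 0 = neg, class 1 = pos)
colnames(comparison)[1:2] <- c("prob_neg", "prob_pos")
head(comparison)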