# Load the Pima Indians Diabetes dataset from the mlbench package
library(mlbench)
library(keras) # needed later for to_categorical() and the model-building functions
data(PimaIndiansDiabetes)
# Rename the dataset to a shorter name for convenience
diabetes <- PimaIndiansDiabetes
data.set <- diabetes
# Optional: interactive preview of a small random sample (requires the DT package)
# datatable(data.set[sample(nrow(data.set),
#                           replace = FALSE,
#                           size = 0.005 * nrow(data.set)), ])
summary(data.set)
##     pregnant         glucose         pressure         triceps     
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     insulin           mass          pedigree           age        diabetes 
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780   Min.   :21.00   neg:500  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437   1st Qu.:24.00   pos:268  
##  Median : 30.5   Median :32.00   Median :0.3725   Median :29.00            
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719   Mean   :33.24            
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262   3rd Qu.:41.00            
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200   Max.   :81.00
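One caveat worth flagging before modelling: in this dataset, zeros in glucose, pressure, triceps, insulin, and mass are widely understood to be unrecorded measurements rather than true values. A quick count of them, as a sketch:
# How many zero entries (i.e. likely missing measurements) per column
sapply(diabetes[, c("glucose", "pressure", "triceps", "insulin", "mass")],
       function(x) sum(x == 0))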
# Convert the diabetes factor (neg/pos) to numeric 0/1 labels
data.set$diabetes <- as.numeric(data.set$diabetes) - 1
head(data.set$diabetes)
## [1] 1 0 1 0 1 0
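The subtraction works because as.numeric() on a factor returns the underlying level codes (neg = 1, pos = 2). An equivalent mapping that does not depend on the level order, sketched here against the untouched diabetes copy:
# Explicit alternative to the level-code trick above
# data.set$diabetes <- ifelse(diabetes$diabetes == "pos", 1, 0)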
head(data.set)
##   pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1        6     148       72      35       0 33.6    0.627  50        1
## 2        1      85       66      29       0 26.6    0.351  31        0
## 3        8     183       64       0       0 23.3    0.672  32        1
## 4        1      89       66      23      94 28.1    0.167  21        0
## 5        0     137       40      35     168 43.1    2.288  33        1
## 6        5     116       74       0       0 25.6    0.201  30        0
str(data.set)
## 'data.frame': 768 obs. of 9 variables:
## $ pregnant: num 6 1 8 1 0 5 3 10 2 8 ...
## $ glucose : num 148 85 183 89 137 116 78 115 197 125 ...
## $ pressure: num 72 66 64 66 40 74 50 0 70 96 ...
## $ triceps : num 35 29 0 23 35 0 32 0 45 0 ...
## $ insulin : num 0 0 0 94 168 0 88 0 543 0 ...
## $ mass : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ pedigree: num 0.627 0.351 0.672 0.167 2.288 ...
## $ age : num 50 31 32 21 33 30 26 29 53 54 ...
## $ diabetes: num 1 0 1 0 1 0 1 0 1 1 ...
# Cast the data frame as a numeric matrix, which is what Keras expects
data.set <- as.matrix(data.set)
# Remove the row and column names
dimnames(data.set) <- NULL
head(data.set)
##      [,1] [,2] [,3] [,4] [,5] [,6]  [,7] [,8] [,9]
## [1,]    6  148   72   35    0 33.6 0.627   50    1
## [2,]    1   85   66   29    0 26.6 0.351   31    0
## [3,]    8  183   64    0    0 23.3 0.672   32    1
## [4,]    1   89   66   23   94 28.1 0.167   21    0
## [5,]    0  137   40   35  168 43.1 2.288   33    1
## [6,]    5  116   74    0    0 25.6 0.201   30    0
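The cast matters because Keras consumes plain numeric matrices (or arrays), not data frames. A minimal sanity check, as a sketch:
# Confirm the model input is a plain numeric matrix
stopifnot(is.matrix(data.set), is.numeric(data.set))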
# Split the data into training and test sets
# (x_train, y_train, x_test, y_test)
set.seed(100)
indx <- sample(2,
               nrow(data.set),
               replace = TRUE,
               prob = c(0.8, 0.2)) # index vector of 1s and 2s (roughly 80/20)
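Because the split is random rather than exact, the realised proportions will only be close to 80/20. A quick check, as a sketch:
# Realised split proportions (approximately 0.8 and 0.2)
table(indx) / length(indx)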
# Select the eight feature columns; index 1 = training rows, 2 = test rows
x_train <- data.set[indx == 1, 1:8]
x_test <- data.set[indx == 2, 1:8]
# Feature scaling: centre/scale the training set, then apply the *training*
# statistics to the test set to avoid leaking test information
x_train <- scale(x_train)
x_test <- scale(x_test,
                center = attr(x_train, "scaled:center"),
                scale = attr(x_train, "scaled:scale"))
# Keep the raw 0/1 test labels for the confusion matrix later
y_test_actual <- data.set[indx == 2, 9]
# One-hot encode the 0/1 labels, using the same indices as the feature split
y_train <- to_categorical(data.set[indx == 1, 9])
## Loaded Tensorflow version 2.8.0
y_test <- to_categorical(data.set[indx == 2, 9])
head(y_train)
## [,1] [,2]
## [1,] 0 1
## [2,] 1 0
## [3,] 0 1
## [4,] 1 0
## [5,] 0 1
## [6,] 1 0
head(data.set[indx == 1, 9],20)
## [1] 1 0 1 0 1 0 0 1 1 0 0 1 1 1 1 1 0 1 0 0
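to_categorical() maps class 0 to the row (1, 0) and class 1 to (0, 1), which is exactly the pattern visible when comparing the two heads above. A minimal standalone illustration:
# Toy example: one-hot encoding a short 0/1 label vector
to_categorical(c(1, 0, 1, 0))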
dim(x_train)
## [1] 609 8
dim(y_train)
## [1] 609 2
dim(x_test)
## [1] 159 8
dim(y_test)
## [1] 159 2
# Creating the model
model <- keras_model_sequential()
model %>%
  layer_dense(name = "DeepLayer1",
              units = 10,
              activation = "relu",
              input_shape = c(8)) %>% # input layer expects the 8 features
  layer_dense(name = "DeepLayer2",
              units = 10,
              activation = "relu") %>%
  layer_dense(name = "OutputLayer",
              units = 2,
              activation = "softmax") # 2 output classes, matching the one-hot labels
summary(model)
## Model: "sequential"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## DeepLayer1 (Dense) (None, 10) 90
## DeepLayer2 (Dense) (None, 10) 110
## OutputLayer (Dense) (None, 2) 22
## ================================================================================
## Total params: 222
## Trainable params: 222
## Non-trainable params: 0
## ________________________________________________________________________________
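The parameter counts follow from weights plus biases in each dense layer: inputs * units + units. A quick check of the numbers in the summary:
# Dense layer parameters = input_dim * units + units (weights + biases)
c(DeepLayer1 = 8 * 10 + 10,   # 90
  DeepLayer2 = 10 * 10 + 10,  # 110
  OutputLayer = 10 * 2 + 2)   # 22, giving 222 in total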
# Compiling the model
model %>% compile(loss = "categorical_crossentropy",
                  optimizer = "adam",
                  metrics = c("accuracy"))
history <- model %>%
  fit(x_train,
      y_train,
      epochs = 60,             # number of passes over the training data
      batch_size = 64,         # mini-batch size
      validation_split = 0.15, # hold out 15% of the training data for validation
      verbose = 2)
plot(history)
Using the x_test and y_test data sets, we can evaluate the fitted model directly:
model %>%
  evaluate(x_test,
           y_test)
##      loss  accuracy
## 0.4579953 0.7610063
pred <- model %>%
  predict(x_test) %>%
  k_argmax() %>%
  k_get_value()
head(pred)
## [1] 0 1 0 1 0 0
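k_argmax() picks the index of the larger probability for each row, and k_get_value() converts the resulting tensor back to an R vector of 0-based labels. A hypothetical base-R equivalent over a probability matrix p would be:
# max.col() is 1-based, so subtract 1 to recover 0/1 labels
# pred <- max.col(p) - 1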
table(Predicted = pred,
      Actual = y_test_actual)
##          Actual
## Predicted  0  1
##         0 89 23
##         1 15 32
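As a sanity check, the accuracy implied by this confusion matrix matches the value reported by evaluate():
# Accuracy = correctly classified / total test cases
(89 + 32) / (89 + 23 + 15 + 32) # 121 / 159 = 0.7610063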
prob <- model %>%
  predict(x_test) %>%
  k_get_value()
head(prob)
##           [,1]       [,2]
## [1,] 0.9557184 0.04428164
## [2,] 0.1685439 0.83145601
## [3,] 0.9652161 0.03478396
## [4,] 0.3754042 0.62459576
## [5,] 0.7928934 0.20710661
## [6,] 0.7832032 0.21679683
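Because the output layer is a softmax, each row of prob is a probability distribution over the two classes. A quick check, as a sketch:
# Each row should sum to 1 (up to floating-point rounding)
rowSums(head(prob))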
# Combine prob, pred, and y_test_actual side by side for inspection
comparison <- cbind(prob,
                    pred,
                    y_test_actual)
head(comparison)
##                           pred y_test_actual
## [1,] 0.9557184 0.04428164    0             1
## [2,] 0.1685439 0.83145601    1             1
## [3,] 0.9652161 0.03478396    0             0
## [4,] 0.3754042 0.62459576    1             1
## [5,] 0.7928934 0.20710661    0             0
## [6,] 0.7832032 0.21679683    0             0
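The two probability columns are unnamed because the prediction matrix carries no dimnames. Giving the comparison matrix descriptive (hypothetical) column names makes it easier to read:
# Optional: label the columns of the comparison matrix
colnames(comparison) <- c("prob_neg", "prob_pos", "pred", "actual")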