## Homework 8: Neural Network Classification Analysis
# Load the Pima Indians diabetes data set, in which missing measurements
# are coded as NA. Naming the package makes the call work even when
# mlbench has not been attached with library().
data("PimaIndiansDiabetes2", package = "mlbench")
# inspect the structure: column names, types and the first few values
str(PimaIndiansDiabetes2)
## 'data.frame': 768 obs. of 9 variables:
## $ pregnant: num 6 1 8 1 0 5 3 10 2 8 ...
## $ glucose : num 148 85 183 89 137 116 78 115 197 125 ...
## $ pressure: num 72 66 64 66 40 74 50 NA 70 96 ...
## $ triceps : num 35 29 NA 23 35 NA 32 NA 45 NA ...
## $ insulin : num NA NA NA 94 168 NA 88 NA 543 NA ...
## $ mass : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 NA ...
## $ pedigree: num 0.627 0.351 0.672 0.167 2.288 ...
## $ age : num 50 31 32 21 33 30 26 29 53 54 ...
## $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
# Per-column summary statistics; the NA's row of the output reports how
# many values are missing in each column.
summary(PimaIndiansDiabetes2)
## pregnant glucose pressure triceps
## Min. : 0.000 Min. : 44.0 Min. : 24.00 Min. : 7.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 64.00 1st Qu.:22.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :29.00
## Mean : 3.845 Mean :121.7 Mean : 72.41 Mean :29.15
## 3rd Qu.: 6.000 3rd Qu.:141.0 3rd Qu.: 80.00 3rd Qu.:36.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## NA's :5 NA's :35 NA's :227
## insulin mass pedigree age diabetes
## Min. : 14.00 Min. :18.20 Min. :0.0780 Min. :21.00 neg:500
## 1st Qu.: 76.25 1st Qu.:27.50 1st Qu.:0.2437 1st Qu.:24.00 pos:268
## Median :125.00 Median :32.30 Median :0.3725 Median :29.00
## Mean :155.55 Mean :32.46 Mean :0.4719 Mean :33.24
## 3rd Qu.:190.00 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.00 Max. :67.10 Max. :2.4200 Max. :81.00
## NA's :374 NA's :11
# count the missing values in every column
vapply(PimaIndiansDiabetes2, function(column) sum(is.na(column)), numeric(1))
## pregnant glucose pressure triceps insulin mass pedigree age
## 0 5 35 227 374 11 0 0
## diabetes
## 0
# remove rows that contain any NA value
PimaIndiansDiabetes2 <- na.omit(PimaIndiansDiabetes2)

# Select the predictors by name instead of assuming that the response is
# the last column.
feature_cols <- setdiff(names(PimaIndiansDiabetes2), "diabetes")

# Split into training (80%) and test (20%) rows. seq_len() is safe for
# zero-row input and floor() makes the truncation of the fractional sample
# size explicit; the rows drawn for this seed are unchanged.
set.seed(123)
n_obs <- nrow(PimaIndiansDiabetes2)
train_indices <- sample(seq_len(n_obs), size = floor(0.8 * n_obs))

# Standardize the features with the mean and standard deviation of the
# training rows only, so that no information from the test rows leaks into
# the preprocessing. The test rows are transformed with those same
# training statistics.
# NOTE: outputs recorded further down in this file were produced with
# whole-data scaling and will shift slightly when the script is re-run.
train_features <- PimaIndiansDiabetes2[train_indices, feature_cols]
train_center <- colMeans(train_features)
train_scale <- vapply(train_features, sd, numeric(1))
data_norm <- as.data.frame(
  scale(PimaIndiansDiabetes2[, feature_cols],
        center = train_center, scale = train_scale)
)
# diabetes is already a factor ("neg"/"pos"), so it is attached unchanged
data_norm$diabetes <- PimaIndiansDiabetes2$diabetes

data_train <- data_norm[train_indices, ]
data_test <- data_norm[-train_indices, ]
# Fit a feed-forward network with two hidden layers (4 and 2 units).
# linear.output = FALSE applies the activation function to the output
# neurons as well, which is the setting used for classification.
# The call is namespace-qualified so it works without library(neuralnet).
nn <- neuralnet::neuralnet(
  diabetes ~ pregnant + glucose + pressure + triceps + insulin + mass +
    pedigree + age,
  data = data_train,
  hidden = c(4, 2),
  linear.output = FALSE
)
# Plot the fitted network. rep = "best" draws on the current graphics
# device; without it plot.nn opens a separate window per repetition, which
# knitr / R Markdown does not capture.
plot(nn, rep = "best")
# Predict on the held-out rows, selecting the predictors by name instead
# of by hard-coded column position.
feature_cols <- setdiff(names(data_test), "diabetes")
predictions <- predict(nn, data_test[, feature_cols])

# The decoding below assumes predict() returns one output column per
# factor level, in level order. Fail loudly if the output has a different
# shape (e.g. a single output unit), because which.max would then return 1
# for every row and the accuracy would be meaningless.
class_levels <- levels(data_test$diabetes)
stopifnot(
  is.matrix(predictions),
  ncol(predictions) == length(class_levels)
)

# index of the highest-scoring output column for every test row
predicted_classes <- apply(predictions, 1, which.max)
# integer codes of the true classes (1 = first level, 2 = second level)
actual_classes <- as.integer(data_test$diabetes)
# accuracy = share of test rows whose predicted code equals the true code
accuracy <- mean(predicted_classes == actual_classes)
cat("Test accuracy:", accuracy, "\n")
## Test accuracy: 0.7594937