## Homework 8: Neural Network Classification Analysis

Loading the Data

# load the Pima Indians diabetes data; naming the package explicitly means the
# data set is found even when mlbench has not been attached with library()
data("PimaIndiansDiabetes2", package = "mlbench")

# inspect the structure: column names, types, and the first few values
str(PimaIndiansDiabetes2)
## 'data.frame':    768 obs. of  9 variables:
##  $ pregnant: num  6 1 8 1 0 5 3 10 2 8 ...
##  $ glucose : num  148 85 183 89 137 116 78 115 197 125 ...
##  $ pressure: num  72 66 64 66 40 74 50 NA 70 96 ...
##  $ triceps : num  35 29 NA 23 35 NA 32 NA 45 NA ...
##  $ insulin : num  NA NA NA 94 168 NA 88 NA 543 NA ...
##  $ mass    : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 NA ...
##  $ pedigree: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ age     : num  50 31 32 21 33 30 26 29 53 54 ...
##  $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
# per-column summary statistics, including the NA count for each column
summary(PimaIndiansDiabetes2)
##     pregnant         glucose         pressure         triceps     
##  Min.   : 0.000   Min.   : 44.0   Min.   : 24.00   Min.   : 7.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 64.00   1st Qu.:22.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :29.00  
##  Mean   : 3.845   Mean   :121.7   Mean   : 72.41   Mean   :29.15  
##  3rd Qu.: 6.000   3rd Qu.:141.0   3rd Qu.: 80.00   3rd Qu.:36.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##                   NA's   :5       NA's   :35       NA's   :227    
##     insulin            mass          pedigree           age        diabetes 
##  Min.   : 14.00   Min.   :18.20   Min.   :0.0780   Min.   :21.00   neg:500  
##  1st Qu.: 76.25   1st Qu.:27.50   1st Qu.:0.2437   1st Qu.:24.00   pos:268  
##  Median :125.00   Median :32.30   Median :0.3725   Median :29.00            
##  Mean   :155.55   Mean   :32.46   Mean   :0.4719   Mean   :33.24            
##  3rd Qu.:190.00   3rd Qu.:36.60   3rd Qu.:0.6262   3rd Qu.:41.00            
##  Max.   :846.00   Max.   :67.10   Max.   :2.4200   Max.   :81.00            
##  NA's   :374      NA's   :11

Data Preprocessing

# count the missing values in every column
na_flags <- is.na(PimaIndiansDiabetes2)
colSums(na_flags)
## pregnant  glucose pressure  triceps  insulin     mass pedigree      age 
##        0        5       35      227      374       11        0        0 
## diabetes 
##        0
# remove rows with NA values (complete-case analysis)
PimaIndiansDiabetes2 <- na.omit(PimaIndiansDiabetes2)

# standardize the predictors (mean 0, sd 1); the outcome column is excluded by
# name rather than by position so the code does not depend on column order
feature_names <- setdiff(names(PimaIndiansDiabetes2), "diabetes")
data_norm <- as.data.frame(scale(PimaIndiansDiabetes2[, feature_names]))
# NOTE(review): the centering/scaling statistics are computed on all rows,
# before the train/test split below, so the test rows influence the scaling.

# attach the outcome; it is already a two-level factor ("neg", "pos"), so no
# further conversion is needed
data_norm$diabetes <- PimaIndiansDiabetes2$diabetes

# split the data into training (80%) and testing (20%) sets
set.seed(123)
n_obs <- nrow(data_norm)
# floor() makes the truncation of the fractional sample size explicit, and
# seq_len() avoids the 1:0 pitfall of 1:nrow() when there are no rows
train_indices <- sample(seq_len(n_obs), size = floor(0.8 * n_obs))
data_train <- data_norm[train_indices, ]
data_test <- data_norm[-train_indices, ]

Building the Neural Network Model

# model specification: the eight standardized predictors feed two hidden
# layers (4 units, then 2 units)
nn_formula <- diabetes ~ pregnant + glucose + pressure + triceps +
  insulin + mass + pedigree + age
hidden_layers <- c(4, 2)

# fit the network; linear.output = FALSE applies the activation function to
# the output neurons as well, which is the classification setting
nn <- neuralnet(
  formula = nn_formula,
  data = data_train,
  hidden = hidden_layers,
  linear.output = FALSE
)

# draw the fitted network
plot(nn)

Model Evaluation

# score the held-out rows; the first eight columns are the predictors
feature_cols <- seq_len(8)
predictions <- predict(nn, data_test[, feature_cols])

# for each row, take the index of the output column with the largest value
predicted_classes <- apply(predictions, MARGIN = 1, FUN = which.max)

# integer codes of the true factor levels, to compare against those indices
actual_classes <- as.integer(data_test$diabetes)

# accuracy is the share of rows where the predicted index equals the true code
is_correct <- predicted_classes == actual_classes
accuracy <- mean(is_correct)
cat("Test accuracy:", accuracy, "\n")
## Test accuracy: 0.7594937