1. Load the dataset and appropriate packages
library("readxl")
## Warning: 程辑包'readxl'是用R版本4.1.2 来建造的
library("dplyr")
## Warning: 程辑包'dplyr'是用R版本4.1.2 来建造的
##
## 载入程辑包:'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("caret")
## Warning: 程辑包'caret'是用R版本4.1.2 来建造的
## 载入需要的程辑包:ggplot2
## Warning: 程辑包'ggplot2'是用R版本4.1.2 来建造的
## 载入需要的程辑包:lattice
# Load the lab workbook; the path is relative to the current working directory.
df <- readxl::read_excel(path = "labW9.xlsx")
2. Explore and check the data, cleaning it where necessary
# Inspect the structure: expect 768 rows and 9 columns, all read as numeric
str(df)
## tibble [768 x 9] (S3: tbl_df/tbl/data.frame)
## $ Pregnancies : num [1:768] 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : num [1:768] 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : num [1:768] 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : num [1:768] 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : num [1:768] 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num [1:768] 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num [1:768] 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : num [1:768] 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : num [1:768] 1 0 1 0 1 0 1 0 1 1 ...
# Summarise every column; pay attention to minimums of 0 in the clinical
# measurements, which are examined further below
summary(df)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
# List the column names (eight predictors plus the Outcome label)
colnames(df)
## [1] "Pregnancies" "Glucose"
## [3] "BloodPressure" "SkinThickness"
## [5] "Insulin" "BMI"
## [7] "DiabetesPedigreeFunction" "Age"
## [9] "Outcome"
# Count explicit missing values (NA) in each column
colSums(is.na(df))
## Pregnancies Glucose BloodPressure
## 0 0 0
## SkinThickness Insulin BMI
## 0 0 0
## DiabetesPedigreeFunction Age Outcome
## 0 0 0
# Treat Outcome as a categorical class label rather than a number
df[["Outcome"]] <- factor(df[["Outcome"]])
# Count the zeros in each column. A zero is not a plausible reading for
# Glucose, BloodPressure, SkinThickness, Insulin or BMI, so zeros in those
# columns stand in for missing measurements. Zeros in Pregnancies and Outcome
# are genuine values and are left alone.
colSums(df == 0)
## Pregnancies Glucose BloodPressure
## 111 5 35
## SkinThickness Insulin BMI
## 227 374 11
## DiabetesPedigreeFunction Age Outcome
## 0 0 500
# Recode the zero placeholders in the five clinical measurements as NA so
# that they are handled as missing values from here on
df <- dplyr::mutate(
  df,
  dplyr::across(
    c(Glucose, BloodPressure, SkinThickness, Insulin, BMI),
    ~ dplyr::na_if(.x, 0)
  )
)
# Re-count the NAs to judge whether dropping every incomplete row is acceptable
colSums(is.na(df))
## Pregnancies Glucose BloodPressure
## 0 5 35
## SkinThickness Insulin BMI
## 227 374 11
## DiabetesPedigreeFunction Age Outcome
## 0 0 0
# SkinThickness (227 NAs) and Insulin (374 NAs) are missing too often to drop
# those rows, so fill their gaps with the mean of the observed values.
# NOTE(review): these means are computed on the full data set, before the
# train/test partition, so rows that later land in the test subset contribute
# to the imputed values. Computing the means on the training rows only would
# avoid that leakage.
df <- dplyr::mutate(
  df,
  SkinThickness = dplyr::coalesce(
    SkinThickness,
    mean(SkinThickness, na.rm = TRUE)
  ),
  Insulin = dplyr::coalesce(
    Insulin,
    mean(Insulin, na.rm = TRUE)
  )
)
# Drop the rows that are still incomplete (NA in Glucose, BloodPressure or BMI)
df <- na.omit(df)
3. Partition data 70/30
# Fix the RNG seed so that the partition drawn below is reproducible
set.seed(100)
# Proportion of rows assigned to the training subset
# (note: this variable shares its name with the base function split())
split <- 0.70
# Sample the training row indices based on the Outcome labels; list=FALSE
# returns them in a form that can be used directly for row indexing
trainIndex <- createDataPartition(df$Outcome, p=split, list=FALSE)
# The sampled rows form the training subset; all remaining rows form the test
# subset
data_train <- df[trainIndex, ]
data_test <- df[-trainIndex, ]
4. Check both training and test subsets
# Inspect the training subset (508 rows). The na.action attribute listed at
# the end of the output is carried over from the earlier na.omit() call.
str(data_train)
## tibble [508 x 9] (S3: tbl_df/tbl/data.frame)
## $ Pregnancies : num [1:508] 6 1 1 5 3 2 4 10 1 5 ...
## $ Glucose : num [1:508] 148 85 89 116 78 197 110 168 189 166 ...
## $ BloodPressure : num [1:508] 72 66 66 74 50 70 92 74 60 72 ...
## $ SkinThickness : num [1:508] 35 29 23 29.2 32 ...
## $ Insulin : num [1:508] 156 156 94 156 88 ...
## $ BMI : num [1:508] 33.6 26.6 28.1 25.6 31 30.5 37.6 38 30.1 25.8 ...
## $ DiabetesPedigreeFunction: num [1:508] 0.627 0.351 0.167 0.201 0.248 0.158 0.191 0.537 0.398 0.587 ...
## $ Age : num [1:508] 50 31 21 30 26 53 30 34 59 51 ...
## $ Outcome : Factor w/ 2 levels "0","1": 2 1 1 1 2 2 1 2 2 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:44] 8 10 16 50 61 76 79 82 146 173 ...
## ..- attr(*, "names")= chr [1:44] "8" "10" "16" "50" ...
# Inspect the test subset (216 rows)
str(data_test)
## tibble [216 x 9] (S3: tbl_df/tbl/data.frame)
## $ Pregnancies : num [1:216] 8 0 10 7 8 1 13 3 2 3 ...
## $ Glucose : num [1:216] 183 137 139 107 99 97 145 158 90 180 ...
## $ BloodPressure : num [1:216] 64 40 80 74 84 66 82 76 68 64 ...
## $ SkinThickness : num [1:216] 29.2 35 29.2 29.2 29.2 ...
## $ Insulin : num [1:216] 156 168 156 156 156 ...
## $ BMI : num [1:216] 23.3 43.1 27.1 29.6 35.4 23.2 22.2 31.6 38.2 34 ...
## $ DiabetesPedigreeFunction: num [1:216] 0.672 2.288 1.441 0.254 0.388 ...
## $ Age : num [1:216] 32 33 57 31 50 22 57 28 27 26 ...
## $ Outcome : Factor w/ 2 levels "0","1": 2 2 1 2 1 1 1 2 2 1 ...
## - attr(*, "na.action")= 'omit' Named int [1:44] 8 10 16 50 61 76 79 82 146 173 ...
## ..- attr(*, "names")= chr [1:44] "8" "10" "16" "50" ...
5. Set up 10-fold cross-validation
# Resampling scheme used while tuning the model: 10-fold cross-validation
train_control <- trainControl(method = "cv", number = 10)
6. Train the model on the training data
# Fit a k-nearest-neighbours classifier on the training subset, using the
# resampling scheme stored in train_control to choose k.
# NOTE(review): no preProcess argument is given, so the predictors enter the
# distance calculation on their raw scales (Insulin reaches 846 while
# DiabetesPedigreeFunction stays below 3); centring and scaling is worth
# considering for kNN.
model <- train(
  Outcome ~ .,
  data = data_train,
  method = "knn",
  trControl = train_control
)
# Show the resampling results and the selected value of k
print(model)
## k-Nearest Neighbors
##
## 508 samples
## 8 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 457, 458, 457, 457, 457, 456, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.7243725 0.3770815
## 7 0.7304103 0.3803224
## 9 0.7441342 0.4138109
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
7. Plot the model
# Plot the tuning results of the fitted model (presumably resampled accuracy
# against k; the figure itself is not reproduced in this transcript)
plot(model)

8. Predict on the test data using the trained model
# Predict the Outcome class for every row of the held-out test subset
predictions <- predict(model, newdata = data_test)
9. Evaluate the outcome
# Compare the predicted classes with the true labels of the test subset.
# NOTE(review): no `positive` class is specified, and the printed result
# reports 'Positive' Class : 0, so Sensitivity describes the "0" class and
# Specificity describes the "1" class.
cml <- confusionMatrix(data = predictions, reference = data_test$Outcome)
print(cml)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 111 30
## 1 31 44
##
## Accuracy : 0.7176
## 95% CI : (0.6525, 0.7766)
## No Information Rate : 0.6574
## P-Value [Acc > NIR] : 0.03505
##
## Kappa : 0.3751
##
## Mcnemar's Test P-Value : 1.00000
##
## Sensitivity : 0.7817
## Specificity : 0.5946
## Pos Pred Value : 0.7872
## Neg Pred Value : 0.5867
## Prevalence : 0.6574
## Detection Rate : 0.5139
## Detection Prevalence : 0.6528
## Balanced Accuracy : 0.6881
##
## 'Positive' Class : 0
##