1. Load the dataset and appropriate packages

library("readxl")
## Warning: 程辑包'readxl'是用R版本4.1.2 来建造的
library("dplyr")
## Warning: 程辑包'dplyr'是用R版本4.1.2 来建造的
## 
## 载入程辑包:'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library("caret")
## Warning: 程辑包'caret'是用R版本4.1.2 来建造的
## 载入需要的程辑包:ggplot2
## Warning: 程辑包'ggplot2'是用R版本4.1.2 来建造的
## 载入需要的程辑包:lattice
# Read the Excel workbook into df; the path is relative to the working directory
df <- read_excel("labW9.xlsx")

2. Explore and check the data, cleaning it where necessary

# Inspect the structure of df: 768 rows and 9 columns, all numeric
str(df)
## tibble [768 x 9] (S3: tbl_df/tbl/data.frame)
##  $ Pregnancies             : num [1:768] 6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : num [1:768] 148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : num [1:768] 72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : num [1:768] 35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : num [1:768] 0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num [1:768] 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num [1:768] 0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : num [1:768] 50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : num [1:768] 1 0 1 0 1 0 1 0 1 1 ...
# Summarise every column; a minimum of 0 in Glucose, BloodPressure,
# SkinThickness, Insulin and BMI suggests missing values recorded as zero
summary(df)
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     Insulin           BMI        DiabetesPedigreeFunction      Age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780           Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437           1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725           Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##     Outcome     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000
# List the nine column names (eight predictors plus Outcome)
colnames(df)
## [1] "Pregnancies"              "Glucose"                 
## [3] "BloodPressure"            "SkinThickness"           
## [5] "Insulin"                  "BMI"                     
## [7] "DiabetesPedigreeFunction" "Age"                     
## [9] "Outcome"
# Count explicit NA values per column (none are present at this point)
colSums(is.na(df))
##              Pregnancies                  Glucose            BloodPressure 
##                        0                        0                        0 
##            SkinThickness                  Insulin                      BMI 
##                        0                        0                        0 
## DiabetesPedigreeFunction                      Age                  Outcome 
##                        0                        0                        0
# Convert Outcome to a factor so it is modelled as a class label
df[["Outcome"]] <- as.factor(df[["Outcome"]])

# Count zeros per column: zero is not a plausible reading for Glucose,
# BloodPressure, SkinThickness, Insulin or BMI, so those zeros are suspect
colSums(df == 0)
##              Pregnancies                  Glucose            BloodPressure 
##                      111                        5                       35 
##            SkinThickness                  Insulin                      BMI 
##                      227                      374                       11 
## DiabetesPedigreeFunction                      Age                  Outcome 
##                        0                        0                      500
# Recode zeros as NA in the five measurements where zero is not a valid reading
zero_coded_cols <- c(
  "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"
)
for (col_name in zero_coded_cols) {
  zero_rows <- df[[col_name]] == 0
  df[[col_name]][zero_rows] <- NA
}

# Recount NAs per column to judge whether dropping every incomplete row is acceptable
colSums(is.na(df))
##              Pregnancies                  Glucose            BloodPressure 
##                        0                        5                       35 
##            SkinThickness                  Insulin                      BMI 
##                      227                      374                       11 
## DiabetesPedigreeFunction                      Age                  Outcome 
##                        0                        0                        0
# Dropping every incomplete row would discard too much data (SkinThickness is
# missing in 227 rows and Insulin in 374 of 768), so fill those two columns
# with their column means instead.
# NOTE(review): the means are computed on the full dataset before the
# train/test split, so test rows influence the values imputed into training
# rows; consider imputing after partitioning, using training-set means only.
skin_mean <- mean(df$SkinThickness, na.rm = TRUE)
insulin_mean <- mean(df$Insulin, na.rm = TRUE)
df$SkinThickness[is.na(df$SkinThickness)] <- skin_mean
df$Insulin[is.na(df$Insulin)] <- insulin_mean

# Glucose, BloodPressure and BMI have few NAs, so drop the rows that still
# contain one
df <- na.omit(df)

3. Partition data 70/30

# Fix the RNG seed so the partition (and later resampling) is reproducible
set.seed(100)
# Proportion of rows assigned to the training subset
split <- 0.70
# Draw the training row indices based on Outcome; list = FALSE returns a
# matrix of indices rather than a list
trainIndex <- createDataPartition(df$Outcome, p=split, list=FALSE)
# Selected rows form the training subset; all remaining rows form the test subset
data_train <- df[trainIndex, ]
data_test <- df[-trainIndex, ]

4. Check both training and test subsets

# Inspect the structure of the training subset (508 rows)
str(data_train)
## tibble [508 x 9] (S3: tbl_df/tbl/data.frame)
##  $ Pregnancies             : num [1:508] 6 1 1 5 3 2 4 10 1 5 ...
##  $ Glucose                 : num [1:508] 148 85 89 116 78 197 110 168 189 166 ...
##  $ BloodPressure           : num [1:508] 72 66 66 74 50 70 92 74 60 72 ...
##  $ SkinThickness           : num [1:508] 35 29 23 29.2 32 ...
##  $ Insulin                 : num [1:508] 156 156 94 156 88 ...
##  $ BMI                     : num [1:508] 33.6 26.6 28.1 25.6 31 30.5 37.6 38 30.1 25.8 ...
##  $ DiabetesPedigreeFunction: num [1:508] 0.627 0.351 0.167 0.201 0.248 0.158 0.191 0.537 0.398 0.587 ...
##  $ Age                     : num [1:508] 50 31 21 30 26 53 30 34 59 51 ...
##  $ Outcome                 : Factor w/ 2 levels "0","1": 2 1 1 1 2 2 1 2 2 2 ...
##  - attr(*, "na.action")= 'omit' Named int [1:44] 8 10 16 50 61 76 79 82 146 173 ...
##   ..- attr(*, "names")= chr [1:44] "8" "10" "16" "50" ...
# Inspect the structure of the test subset (216 rows)
str(data_test)
## tibble [216 x 9] (S3: tbl_df/tbl/data.frame)
##  $ Pregnancies             : num [1:216] 8 0 10 7 8 1 13 3 2 3 ...
##  $ Glucose                 : num [1:216] 183 137 139 107 99 97 145 158 90 180 ...
##  $ BloodPressure           : num [1:216] 64 40 80 74 84 66 82 76 68 64 ...
##  $ SkinThickness           : num [1:216] 29.2 35 29.2 29.2 29.2 ...
##  $ Insulin                 : num [1:216] 156 168 156 156 156 ...
##  $ BMI                     : num [1:216] 23.3 43.1 27.1 29.6 35.4 23.2 22.2 31.6 38.2 34 ...
##  $ DiabetesPedigreeFunction: num [1:216] 0.672 2.288 1.441 0.254 0.388 ...
##  $ Age                     : num [1:216] 32 33 57 31 50 22 57 28 27 26 ...
##  $ Outcome                 : Factor w/ 2 levels "0","1": 2 2 1 2 1 1 1 2 2 1 ...
##  - attr(*, "na.action")= 'omit' Named int [1:44] 8 10 16 50 61 76 79 82 146 173 ...
##   ..- attr(*, "names")= chr [1:44] "8" "10" "16" "50" ...

5. Set up cross-validation

# Configure 10-fold cross-validation for model tuning
train_control <- trainControl(method = "cv", number = 10)

6. Train the model on the training data

# Fit a k-nearest-neighbours classifier for Outcome on all other columns of
# the training subset, tuning k with the resampling scheme in train_control.
# NOTE(review): no pre-processing is applied, so predictors on large scales
# (e.g. Insulin, up to 846) dominate the distance over small-scale ones
# (e.g. DiabetesPedigreeFunction, up to 2.42); consider adding
# preProcess = c("center", "scale").
model <- train(
  Outcome ~ .,
  data = data_train,
  method = "knn",
  trControl = train_control
)
print(model)
## k-Nearest Neighbors 
## 
## 508 samples
##   8 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 457, 458, 457, 457, 457, 456, ... 
## Resampling results across tuning parameters:
## 
##   k  Accuracy   Kappa    
##   5  0.7243725  0.3770815
##   7  0.7304103  0.3803224
##   9  0.7441342  0.4138109
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.

7. Plot the model

# Plot the tuning results stored in the fitted model
plot(model)

8. Predict on the test data using the model

# Predict Outcome for every row of the held-out test subset with the fitted model
predictions <- predict(model, newdata = data_test)

9. Evaluate the outcome

# Cross-tabulate predicted against actual classes on the test subset.
# Outcome "1" (presumably the diabetic class; confirm against the data
# dictionary) is passed as the positive class. Without it confusionMatrix()
# takes the first factor level, "0", as positive, so sensitivity, specificity
# and the predictive values would describe detection of the majority class.
cml <- confusionMatrix(predictions, data_test$Outcome, positive = "1")
cml
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 111  30
##          1  31  44
##                                           
##                Accuracy : 0.7176          
##                  95% CI : (0.6525, 0.7766)
##     No Information Rate : 0.6574          
##     P-Value [Acc > NIR] : 0.03505         
##                                           
##                   Kappa : 0.3751          
##                                           
##  Mcnemar's Test P-Value : 1.00000         
##                                           
##             Sensitivity : 0.5946          
##             Specificity : 0.7817          
##          Pos Pred Value : 0.5867          
##          Neg Pred Value : 0.7872          
##              Prevalence : 0.3426          
##          Detection Rate : 0.2037          
##    Detection Prevalence : 0.3472          
##       Balanced Accuracy : 0.6881          
##                                           
##        'Positive' Class : 1               
##