W10LabTutorial

Load the dataset and appropriate packages

library(caret)

## Warning: package 'caret' was built under R version 4.1.2

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 4.1.2

## Loading required package: lattice

library(readxl)

## Warning: package 'readxl' was built under R version 4.1.2

# Read the first worksheet of the lab workbook into `data1`, then preview the
# first rows to confirm the import looks sensible.
data1 <- read_excel(
  path = "/Users/aziah/Documents/RProject/labW9.xlsx",
  sheet = 1
)
head(data1)

## # A tibble: 6 x 9
##   Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI DiabetesPedigre~
##         <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>            <dbl>
## 1           6     148            72            35       0  33.6            0.627
## 2           1      85            66            29       0  26.6            0.351
## 3           8     183            64             0       0  23.3            0.672
## 4           1      89            66            23      94  28.1            0.167
## 5           0     137            40            35     168  43.1            2.29 
## 6           5     116            74             0       0  25.6            0.201
## # ... with 2 more variables: Age <dbl>, Outcome <dbl>

Conduct data exploration and checking, and clean the data if necessary

# Per-column summary statistics (minimum, quartiles, median, mean, maximum).
summary(data1)

##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     Insulin           BMI        DiabetesPedigreeFunction      Age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780           Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437           1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725           Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##     Outcome     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000

# List the column names of the data set.
colnames(data1)

## [1] "Pregnancies"              "Glucose"                 
## [3] "BloodPressure"            "SkinThickness"           
## [5] "Insulin"                  "BMI"                     
## [7] "DiabetesPedigreeFunction" "Age"                     
## [9] "Outcome"

# Show the structure of the data set: its dimensions and the type of each
# column.
str(data1)

## tibble [768 x 9] (S3: tbl_df/tbl/data.frame)
##  $ Pregnancies             : num [1:768] 6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : num [1:768] 148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : num [1:768] 72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : num [1:768] 35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : num [1:768] 0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num [1:768] 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num [1:768] 0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : num [1:768] 50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : num [1:768] 1 0 1 0 1 0 1 0 1 1 ...

# Count the NA values in each column.
# NOTE(review): the summary output reports a minimum of 0 for Glucose,
# BloodPressure, SkinThickness, Insulin and BMI. Those zeros may be missing
# measurements encoded as 0, which is.na() does not detect -- confirm before
# treating the data as complete.
colSums(is.na(data1))

##              Pregnancies                  Glucose            BloodPressure 
##                        0                        0                        0 
##            SkinThickness                  Insulin                      BMI 
##                        0                        0                        0 
## DiabetesPedigreeFunction                      Age                  Outcome 
##                        0                        0                        0

# Turn the numeric 0/1 Outcome column into a factor so it is modelled as a
# class label (classification) rather than as a number (regression).
data1[["Outcome"]] <- factor(data1[["Outcome"]])

Partition data 70/30 using any method you feel comfortable with

# Split the data 70/30 into training and test subsets, sampling on Outcome.
# createDataPartition() draws rows at random, so the RNG seed is fixed first;
# without it the training and test rows differ on every run and none of the
# results below can be reproduced.
set.seed(100)
split <- 0.7
trainIndex <- createDataPartition(data1$Outcome, p = split, list = FALSE)
# trainIndex holds the row positions chosen for training; every remaining row
# goes to the test subset.
data1_train <- data1[trainIndex, ]
data1_test <- data1[-trainIndex, ]

Check both your training and test subsets

# Number of rows in the training subset (about 70% of the 768 rows).
nrow(data1_train)

## [1] 538

# Number of rows in the test subset (the rows not selected for training).
nrow(data1_test)

## [1] 230

Set up cross-validation if the model allows for it

# Resampling settings passed to train() below: 5-fold cross-validation.
cross_validate <- trainControl(method = "cv", number = 5)

Train a model on your training data using any model you feel is appropriate

# Fit a k-nearest-neighbours classifier for Outcome using every other column
# as a predictor. The seed is fixed so the cross-validation fold assignment is
# repeatable; the value of k is chosen from the resampling results.
# NOTE(review): the model output reports "No pre-processing". kNN is
# distance-based and the predictors are on very different scales, so consider
# adding preProcess = c("center", "scale") -- confirm with the lab brief.
set.seed(100)
knn_model <- train(
  Outcome ~ .,
  data = data1_train,
  method = "knn",
  trControl = cross_validate
)
knn_model

## k-Nearest Neighbors 
## 
## 538 samples
##   8 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 431, 430, 431, 430, 430 
## Resampling results across tuning parameters:
## 
##   k  Accuracy   Kappa    
##   5  0.7267567  0.3734897
##   7  0.7378851  0.3974358
##   9  0.7509346  0.4266183
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.

Plot your model

# Plot the fitted model's resampling results (for a caret train object this is
# expected to show the performance metric across the tuned values of k).
plot(knn_model)

Predict on your test data using your model

# Predict the Outcome class for every row of the held-out test subset with the
# tuned kNN model, then display the predicted labels.
knn_predict <- predict(
  object = knn_model,
  newdata = data1_test
)
knn_predict

##   [1] 0 0 1 1 1 1 0 0 1 1 1 0 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 1 1
##  [38] 0 1 0 0 0 0 0 0 1 0 0 1 1 1 1 1 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 1 0 0 1
##  [75] 0 1 1 0 0 0 1 0 0 0 0 1 1 0 1 0 0 1 0 1 0 0 1 0 0 1 1 0 0 1 0 0 1 0 1 1 0
## [112] 1 0 0 0 0 0 0 1 0 0 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0
## [149] 1 0 0 0 1 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 1 0 1 0
## [186] 1 0 0 0 1 0 1 0 1 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0
## [223] 1 0 0 0 1 1 0 0
## Levels: 0 1

Evaluate your outcome using any suitable method

# Compare the predicted classes with the true test labels and report accuracy,
# kappa and the per-class statistics.
# NOTE(review): the output lists the 'Positive' class as 0, so sensitivity and
# specificity are reported relative to Outcome = 0. If Outcome = 1 is the class
# of interest, pass positive = "1" -- confirm.
confusionMatrix(
  data = knn_predict,
  reference = data1_test$Outcome
)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 117  34
##          1  33  46
##                                           
##                Accuracy : 0.7087          
##                  95% CI : (0.6454, 0.7666)
##     No Information Rate : 0.6522          
##     P-Value [Acc > NIR] : 0.04036         
##                                           
##                   Kappa : 0.356           
##                                           
##  Mcnemar's Test P-Value : 1.00000         
##                                           
##             Sensitivity : 0.7800          
##             Specificity : 0.5750          
##          Pos Pred Value : 0.7748          
##          Neg Pred Value : 0.5823          
##              Prevalence : 0.6522          
##          Detection Rate : 0.5087          
##    Detection Prevalence : 0.6565          
##       Balanced Accuracy : 0.6775          
##                                           
##        'Positive' Class : 0               
##

W10LabTutorial

Noraziah Suliman

1/6/2022