Load the dataset and appropriate packages
library(caret)
## Warning: package 'caret' was built under R version 4.1.2
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.1.2
## Loading required package: lattice
library(readxl)
## Warning: package 'readxl' was built under R version 4.1.2
# Read the first worksheet of the lab workbook into a tibble and preview it.
data1 <- read_excel("/Users/aziah/Documents/RProject/labW9.xlsx", sheet = 1)
head(data1, n = 6)
## # A tibble: 6 x 9
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigre~
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6 148 72 35 0 33.6 0.627
## 2 1 85 66 29 0 26.6 0.351
## 3 8 183 64 0 0 23.3 0.672
## 4 1 89 66 23 94 28.1 0.167
## 5 0 137 40 35 168 43.1 2.29
## 6 5 116 74 0 0 25.6 0.201
## # ... with 2 more variables: Age <dbl>, Outcome <dbl>
Conduct data exploration and checking and cleaning if necessary
# Per-column summary statistics (min, quartiles, mean, max) for every variable.
summary(data1)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
# List the variable names of the dataset.
names(data1)
## [1] "Pregnancies" "Glucose"
## [3] "BloodPressure" "SkinThickness"
## [5] "Insulin" "BMI"
## [7] "DiabetesPedigreeFunction" "Age"
## [9] "Outcome"
# Show the compact structure: dimensions, column types, and leading values.
str(data1)
## tibble [768 x 9] (S3: tbl_df/tbl/data.frame)
## $ Pregnancies : num [1:768] 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : num [1:768] 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : num [1:768] 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : num [1:768] 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : num [1:768] 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num [1:768] 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num [1:768] 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : num [1:768] 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : num [1:768] 1 0 1 0 1 0 1 0 1 1 ...
# Count explicit NA values in each column.
# NOTE(review): the summary output shows a minimum of 0 for Glucose,
# BloodPressure, SkinThickness, Insulin and BMI; these zeros may be
# placeholder codes for missing measurements, which is.na() does not
# detect -- confirm and recode to NA if so.
colSums(is.na(data1))
## Pregnancies Glucose BloodPressure
## 0 0 0
## SkinThickness Insulin BMI
## 0 0 0
## DiabetesPedigreeFunction Age Outcome
## 0 0 0
# Recode the 0/1 outcome as a factor so it is treated as a class label.
data1[["Outcome"]] <- factor(data1[["Outcome"]])
Partition data 70/30 using any method you feel comfortable with
# Fix the RNG seed before partitioning so the 70/30 split is reproducible;
# createDataPartition() samples rows at random within each Outcome level.
set.seed(100)
split <- 0.7
trainIndex <- createDataPartition(data1$Outcome, p = split, list = FALSE)
# Rows selected by the partition form the training set; the rest are held out.
data1_train <- data1[trainIndex, ]
data1_test <- data1[-trainIndex, ]
Check both your training and test subsets
# Row count of the training subset.
dim(data1_train)[1L]
## [1] 538
# Row count of the held-out test subset.
dim(data1_test)[1L]
## [1] 230
Set up cross-validation if the model allows for it
# Resampling scheme used for model tuning: 5-fold cross-validation.
cross_validate <- trainControl(
  method = "cv",
  number = 5
)
Train a model on your training data using any model you feel is appropriate
# Seed the RNG so the cross-validation folds are reproducible.
set.seed(100)
# k-NN classifies by distance between observations, so predictors measured on
# very different scales (e.g. Insulin vs. DiabetesPedigreeFunction) must be
# centred and scaled; otherwise the widest-ranging variables dominate.
# train() stores the transformation and re-applies it inside predict().
# NOTE: the results printed below were produced without pre-processing;
# re-run to refresh them.
knn_model <- train(
  Outcome ~ .,
  data = data1_train,
  method = "knn",
  preProcess = c("center", "scale"),
  trControl = cross_validate
)
knn_model
## k-Nearest Neighbors
##
## 538 samples
## 8 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 431, 430, 431, 430, 430
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.7267567 0.3734897
## 7 0.7378851 0.3974358
## 9 0.7509346 0.4266183
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
Plot your model
# Plot the cross-validated accuracy of the model for each candidate k.
plot(knn_model)
Predict on your test data using your model
# Classify the held-out test rows with the tuned k-NN model and show the labels.
knn_predict <- predict(knn_model, data1_test)
print(knn_predict)
## [1] 0 0 1 1 1 1 0 0 1 1 1 0 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 1 1
## [38] 0 1 0 0 0 0 0 0 1 0 0 1 1 1 1 1 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 1 0 0 1
## [75] 0 1 1 0 0 0 1 0 0 0 0 1 1 0 1 0 0 1 0 1 0 0 1 0 0 1 1 0 0 1 0 0 1 0 1 1 0
## [112] 1 0 0 0 0 0 0 1 0 0 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0
## [149] 1 0 0 0 1 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 1 0 1 0
## [186] 1 0 0 0 1 0 1 0 1 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0
## [223] 1 0 0 0 1 1 0 0
## Levels: 0 1
Evaluate your outcome using any suitable method
# Compare predictions with the true test labels. By default confusionMatrix()
# takes the first factor level ("0") as the positive class, so sensitivity and
# specificity would describe detection of the "0" class.
# Set positive = "1" so the metrics refer to the "1" class
# (presumably diabetic -- confirm the Outcome coding).
# NOTE: the statistics printed below were generated with positive class "0";
# re-run to refresh them.
confusionMatrix(knn_predict, data1_test$Outcome, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 117 34
## 1 33 46
##
## Accuracy : 0.7087
## 95% CI : (0.6454, 0.7666)
## No Information Rate : 0.6522
## P-Value [Acc > NIR] : 0.04036
##
## Kappa : 0.356
##
## Mcnemar's Test P-Value : 1.00000
##
## Sensitivity : 0.7800
## Specificity : 0.5750
## Pos Pred Value : 0.7748
## Neg Pred Value : 0.5823
## Prevalence : 0.6522
## Detection Rate : 0.5087
## Detection Prevalence : 0.6565
## Balanced Accuracy : 0.6775
##
## 'Positive' Class : 0
##