library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
data(PimaIndiansDiabetes2, package = "mlbench")
colnames(PimaIndiansDiabetes2)
## [1] "pregnant" "glucose"  "pressure" "triceps"  "insulin"  "mass"     "pedigree"
## [8] "age"      "diabetes"
vapply(PimaIndiansDiabetes2, class, character(1))
##  pregnant   glucose  pressure   triceps   insulin      mass  pedigree       age 
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" 
##  diabetes 
##  "factor"
#Check for NA
vapply(PimaIndiansDiabetes2, anyNA, logical(1))
## pregnant  glucose pressure  triceps  insulin     mass pedigree      age 
##    FALSE     TRUE     TRUE     TRUE     TRUE     TRUE    FALSE    FALSE 
## diabetes 
##    FALSE
fresh_pima <- PimaIndiansDiabetes2 %>% drop_na
##2.1

logistic <- glm(diabetes ~ ., data = fresh_pima, family = binomial(link = "logit"))

summary(logistic)
## 
## Call:
## glm(formula = diabetes ~ ., family = binomial(link = "logit"), 
##     data = fresh_pima)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.004e+01  1.218e+00  -8.246  < 2e-16 ***
## pregnant     8.216e-02  5.543e-02   1.482  0.13825    
## glucose      3.827e-02  5.768e-03   6.635 3.24e-11 ***
## pressure    -1.420e-03  1.183e-02  -0.120  0.90446    
## triceps      1.122e-02  1.708e-02   0.657  0.51128    
## insulin     -8.253e-04  1.306e-03  -0.632  0.52757    
## mass         7.054e-02  2.734e-02   2.580  0.00989 ** 
## pedigree     1.141e+00  4.274e-01   2.669  0.00760 ** 
## age          3.395e-02  1.838e-02   1.847  0.06474 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 498.10  on 391  degrees of freedom
## Residual deviance: 344.02  on 383  degrees of freedom
## AIC: 362.02
## 
## Number of Fisher Scoring iterations: 5
#Get accuracy of logistic regression
logit_decision <- ifelse(logistic$fitted.values > 0.5, "pos", "neg")

logit_accuracy <- mean(logit_decision == fresh_pima$diabetes, na.rm = TRUE) * 100

logit_accuracy
## [1] 78.31633
#3.1 splitting data
set.seed(530306377)
train <- createDataPartition(fresh_pima$diabetes, p = 0.75)[[1]]

pima_train <- fresh_pima[train,]

pima_test <- fresh_pima[-train,]
#3.2

log_model <- train(diabetes~., data = pima_train,  method = "glm", family = binomial(link = "logit"), trControl = trainControl(method = "repeatedcv", repeats = 5))
log_model
## Generalized Linear Model 
## 
## 295 samples
##   8 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 265, 266, 265, 266, 266, 265, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.7836765  0.4882072
lda_model <- train(diabetes~., data = pima_train, method = "lda", trControl = trainControl(method = "repeatedcv", repeats = 5))
lda_model
## Linear Discriminant Analysis 
## 
## 295 samples
##   8 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 266, 266, 265, 265, 265, 266, ... 
## Resampling results:
## 
##   Accuracy  Kappa    
##   0.782069  0.4777004
knn_model <- train(diabetes~., data = pima_train, method = "knn", trControl = trainControl(method = "repeatedcv", repeats = 5))

knn_model
## k-Nearest Neighbors 
## 
## 295 samples
##   8 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 266, 266, 265, 266, 265, 265, ... 
## Resampling results across tuning parameters:
## 
##   k  Accuracy   Kappa    
##   5  0.7384023  0.3882928
##   7  0.7491839  0.4104230
##   9  0.7614384  0.4361467
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
svm_model <- train(diabetes~., data = pima_train, method = "svmLinearWeights", trControl = trainControl(method = "repeatedcv", repeats = 5))

svm_model
## Linear Support Vector Machines with Class Weights 
## 
## 295 samples
##   8 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 266, 265, 265, 266, 265, 265, ... 
## Resampling results across tuning parameters:
## 
##   cost  weight  Accuracy   Kappa    
##   0.25  1       0.7760082  0.4665989
##   0.25  2       0.7708358  0.5029430
##   0.25  3       0.7559589  0.5048300
##   0.50  1       0.7800772  0.4763968
##   0.50  2       0.7701478  0.5006722
##   0.50  3       0.7566716  0.5052540
##   1.00  1       0.7821494  0.4816081
##   1.00  2       0.7694105  0.4997107
##   1.00  3       0.7539327  0.5001749
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were cost = 1 and weight = 1.
#3.3
log_pred <- predict(log_model, newdata = pima_test)

log_confusion <- confusionMatrix(log_pred, pima_test$diabetes)

log_confusion
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg  58  13
##        pos   7  19
##                                           
##                Accuracy : 0.7938          
##                  95% CI : (0.6997, 0.8693)
##     No Information Rate : 0.6701          
##     P-Value [Acc > NIR] : 0.005133        
##                                           
##                   Kappa : 0.5103          
##                                           
##  Mcnemar's Test P-Value : 0.263552        
##                                           
##             Sensitivity : 0.8923          
##             Specificity : 0.5938          
##          Pos Pred Value : 0.8169          
##          Neg Pred Value : 0.7308          
##              Prevalence : 0.6701          
##          Detection Rate : 0.5979          
##    Detection Prevalence : 0.7320          
##       Balanced Accuracy : 0.7430          
##                                           
##        'Positive' Class : neg             
## 
knn_pred <- predict(knn_model, newdata = pima_test)

knnconfusion <- confusionMatrix(knn_pred, pima_test$diabetes)
all_models <- list(log_model, lda_model, knn_model, svm_model)

matrices <- lapply(all_models, function(model){
  pred <- predict(model, newdata= pima_test)
  confusionMatrix(pred, pima_test$diabetes)
})

names(matrices) <- c("Logistic", "LDA", "kNN", "SVM")

results <- matrix(NA, nrow = 7, ncol = length(matrices))

for (i in seq_along(matrices)){
  
  results[, i] <- matrices[[i]][["overall"]]
  
  
}

results_df <- as.data.frame(results)
colnames(results_df) <- c("Logistic", "LDA", "kNN", "SVM")
rownames(results_df) <- names(matrices[[1]][["overall"]])
library(knitr)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
table <- kable(results_df, caption = "Basic Accuracy Results for each Model")%>%
  kable_classic_2(c("striped", "hover"), html_font = "Cambria", full_width = F, position = "center")

table
Basic Accuracy Results for each Model
Logistic LDA kNN SVM
Accuracy 0.7938144 0.7731959 0.7422680 0.7628866
Kappa 0.5103483 0.4613831 0.3722495 0.4224696
AccuracyLower 0.6996838 0.6769959 0.6434684 0.6657558
AccuracyUpper 0.8692713 0.8520740 0.8257518 0.8433668
AccuracyNull 0.6701031 0.6701031 0.6701031 0.6701031
AccuracyPValue 0.0051329 0.0178257 0.0781344 0.0306572
McnemarPValue 0.2635525 0.2864220 0.1095986 0.0952928

Discussion

The accuracy for all 4 models are fairly similar ranging from kNN’s 74.22% –> Logistic Regression’s 79.38%. LDA performs slightly better than SVM but Logistic Regression is marginaly the most accurate model. Lets look at some other measures to get a better idea.

res <- matrix(NA, nrow = 11, ncol = length(matrices))

for (i in seq_along(matrices)){
  
  res[, i] <- matrices[[i]][["byClass"]]
  
  
}

res_df <- as.data.frame(res)
colnames(res_df) <- c("Logistic", "LDA", "kNN", "SVM")
rownames(res_df) <- names(matrices[[1]][["byClass"]])
table2 <- kable(res_df, caption = "Other Accuracy Results for each Model")%>%
  kable_classic_2(c("striped", "hover"), html_font = "Cambria", full_width = F, position = "center")

table2
Other Accuracy Results for each Model
Logistic LDA kNN SVM
Sensitivity 0.8923077 0.8769231 0.8769231 0.8923077
Specificity 0.5937500 0.5625000 0.4687500 0.5000000
Pos Pred Value 0.8169014 0.8028169 0.7702703 0.7837838
Neg Pred Value 0.7307692 0.6923077 0.6521739 0.6956522
Precision 0.8169014 0.8028169 0.7702703 0.7837838
Recall 0.8923077 0.8769231 0.8769231 0.8923077
F1 0.8529412 0.8382353 0.8201439 0.8345324
Prevalence 0.6701031 0.6701031 0.6701031 0.6701031
Detection Rate 0.5979381 0.5876289 0.5876289 0.5979381
Detection Prevalence 0.7319588 0.7319588 0.7628866 0.7628866
Balanced Accuracy 0.7430288 0.7197115 0.6728365 0.6961538

Discussion