library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
data(PimaIndiansDiabetes2, package = "mlbench")
colnames(PimaIndiansDiabetes2)
## [1] "pregnant" "glucose" "pressure" "triceps" "insulin" "mass" "pedigree"
## [8] "age" "diabetes"
vapply(PimaIndiansDiabetes2, class, character(1))
## pregnant glucose pressure triceps insulin mass pedigree age
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## diabetes
## "factor"
#Check for NA
vapply(PimaIndiansDiabetes2, anyNA, logical(1))
## pregnant glucose pressure triceps insulin mass pedigree age
## FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE
## diabetes
## FALSE
fresh_pima <- PimaIndiansDiabetes2 %>% drop_na
##2.1
logistic <- glm(diabetes ~ ., data = fresh_pima, family = binomial(link = "logit"))
summary(logistic)
##
## Call:
## glm(formula = diabetes ~ ., family = binomial(link = "logit"),
## data = fresh_pima)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.004e+01 1.218e+00 -8.246 < 2e-16 ***
## pregnant 8.216e-02 5.543e-02 1.482 0.13825
## glucose 3.827e-02 5.768e-03 6.635 3.24e-11 ***
## pressure -1.420e-03 1.183e-02 -0.120 0.90446
## triceps 1.122e-02 1.708e-02 0.657 0.51128
## insulin -8.253e-04 1.306e-03 -0.632 0.52757
## mass 7.054e-02 2.734e-02 2.580 0.00989 **
## pedigree 1.141e+00 4.274e-01 2.669 0.00760 **
## age 3.395e-02 1.838e-02 1.847 0.06474 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 498.10 on 391 degrees of freedom
## Residual deviance: 344.02 on 383 degrees of freedom
## AIC: 362.02
##
## Number of Fisher Scoring iterations: 5
#Get accuracy of logistic regression
logit_decision <- ifelse(logistic$fitted.values > 0.5, "pos", "neg")
logit_accuracy <- mean(logit_decision == fresh_pima$diabetes, na.rm = TRUE) * 100
logit_accuracy
## [1] 78.31633
#3.1 splitting data
set.seed(530306377)
train <- createDataPartition(fresh_pima$diabetes, p = 0.75)[[1]]
pima_train <- fresh_pima[train,]
pima_test <- fresh_pima[-train,]
#3.2
log_model <- train(diabetes~., data = pima_train, method = "glm", family = binomial(link = "logit"), trControl = trainControl(method = "repeatedcv", repeats = 5))
log_model
## Generalized Linear Model
##
## 295 samples
## 8 predictor
## 2 classes: 'neg', 'pos'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 265, 266, 265, 266, 266, 265, ...
## Resampling results:
##
## Accuracy Kappa
## 0.7836765 0.4882072
lda_model <- train(diabetes~., data = pima_train, method = "lda", trControl = trainControl(method = "repeatedcv", repeats = 5))
lda_model
## Linear Discriminant Analysis
##
## 295 samples
## 8 predictor
## 2 classes: 'neg', 'pos'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 266, 266, 265, 265, 265, 266, ...
## Resampling results:
##
## Accuracy Kappa
## 0.782069 0.4777004
knn_model <- train(diabetes~., data = pima_train, method = "knn", trControl = trainControl(method = "repeatedcv", repeats = 5))
knn_model
## k-Nearest Neighbors
##
## 295 samples
## 8 predictor
## 2 classes: 'neg', 'pos'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 266, 266, 265, 266, 265, 265, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.7384023 0.3882928
## 7 0.7491839 0.4104230
## 9 0.7614384 0.4361467
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
svm_model <- train(diabetes~., data = pima_train, method = "svmLinearWeights", trControl = trainControl(method = "repeatedcv", repeats = 5))
svm_model
## Linear Support Vector Machines with Class Weights
##
## 295 samples
## 8 predictor
## 2 classes: 'neg', 'pos'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 266, 265, 265, 266, 265, 265, ...
## Resampling results across tuning parameters:
##
## cost weight Accuracy Kappa
## 0.25 1 0.7760082 0.4665989
## 0.25 2 0.7708358 0.5029430
## 0.25 3 0.7559589 0.5048300
## 0.50 1 0.7800772 0.4763968
## 0.50 2 0.7701478 0.5006722
## 0.50 3 0.7566716 0.5052540
## 1.00 1 0.7821494 0.4816081
## 1.00 2 0.7694105 0.4997107
## 1.00 3 0.7539327 0.5001749
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were cost = 1 and weight = 1.
#3.3
log_pred <- predict(log_model, newdata = pima_test)
log_confusion <- confusionMatrix(log_pred, pima_test$diabetes)
log_confusion
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 58 13
## pos 7 19
##
## Accuracy : 0.7938
## 95% CI : (0.6997, 0.8693)
## No Information Rate : 0.6701
## P-Value [Acc > NIR] : 0.005133
##
## Kappa : 0.5103
##
## Mcnemar's Test P-Value : 0.263552
##
## Sensitivity : 0.8923
## Specificity : 0.5938
## Pos Pred Value : 0.8169
## Neg Pred Value : 0.7308
## Prevalence : 0.6701
## Detection Rate : 0.5979
## Detection Prevalence : 0.7320
## Balanced Accuracy : 0.7430
##
## 'Positive' Class : neg
##
knn_pred <- predict(knn_model, newdata = pima_test)
knnconfusion <- confusionMatrix(knn_pred, pima_test$diabetes)
all_models <- list(log_model, lda_model, knn_model, svm_model)
matrices <- lapply(all_models, function(model){
pred <- predict(model, newdata= pima_test)
confusionMatrix(pred, pima_test$diabetes)
})
names(matrices) <- c("Logistic", "LDA", "kNN", "SVM")
results <- matrix(NA, nrow = 7, ncol = length(matrices))
for (i in seq_along(matrices)){
results[, i] <- matrices[[i]][["overall"]]
}
results_df <- as.data.frame(results)
colnames(results_df) <- c("Logistic", "LDA", "kNN", "SVM")
rownames(results_df) <- names(matrices[[1]][["overall"]])
library(knitr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
table <- kable(results_df, caption = "Basic Accuracy Results for each Model")%>%
kable_classic_2(c("striped", "hover"), html_font = "Cambria", full_width = F, position = "center")
table
| Logistic | LDA | kNN | SVM | |
|---|---|---|---|---|
| Accuracy | 0.7938144 | 0.7731959 | 0.7422680 | 0.7628866 |
| Kappa | 0.5103483 | 0.4613831 | 0.3722495 | 0.4224696 |
| AccuracyLower | 0.6996838 | 0.6769959 | 0.6434684 | 0.6657558 |
| AccuracyUpper | 0.8692713 | 0.8520740 | 0.8257518 | 0.8433668 |
| AccuracyNull | 0.6701031 | 0.6701031 | 0.6701031 | 0.6701031 |
| AccuracyPValue | 0.0051329 | 0.0178257 | 0.0781344 | 0.0306572 |
| McnemarPValue | 0.2635525 | 0.2864220 | 0.1095986 | 0.0952928 |
The accuracy for all 4 models are fairly similar ranging from kNN’s 74.22% –> Logistic Regression’s 79.38%. LDA performs slightly better than SVM but Logistic Regression is marginaly the most accurate model. Lets look at some other measures to get a better idea.
res <- matrix(NA, nrow = 11, ncol = length(matrices))
for (i in seq_along(matrices)){
res[, i] <- matrices[[i]][["byClass"]]
}
res_df <- as.data.frame(res)
colnames(res_df) <- c("Logistic", "LDA", "kNN", "SVM")
rownames(res_df) <- names(matrices[[1]][["byClass"]])
table2 <- kable(res_df, caption = "Other Accuracy Results for each Model")%>%
kable_classic_2(c("striped", "hover"), html_font = "Cambria", full_width = F, position = "center")
table2
| Logistic | LDA | kNN | SVM | |
|---|---|---|---|---|
| Sensitivity | 0.8923077 | 0.8769231 | 0.8769231 | 0.8923077 |
| Specificity | 0.5937500 | 0.5625000 | 0.4687500 | 0.5000000 |
| Pos Pred Value | 0.8169014 | 0.8028169 | 0.7702703 | 0.7837838 |
| Neg Pred Value | 0.7307692 | 0.6923077 | 0.6521739 | 0.6956522 |
| Precision | 0.8169014 | 0.8028169 | 0.7702703 | 0.7837838 |
| Recall | 0.8923077 | 0.8769231 | 0.8769231 | 0.8923077 |
| F1 | 0.8529412 | 0.8382353 | 0.8201439 | 0.8345324 |
| Prevalence | 0.6701031 | 0.6701031 | 0.6701031 | 0.6701031 |
| Detection Rate | 0.5979381 | 0.5876289 | 0.5876289 | 0.5979381 |
| Detection Prevalence | 0.7319588 | 0.7319588 | 0.7628866 | 0.7628866 |
| Balanced Accuracy | 0.7430288 | 0.7197115 | 0.6728365 | 0.6961538 |
Logistic Regression and SVM tie for the highest score in Sensitivity. Sensitivity is a measure of how well a model detects true positive instances.
All of our models do not do very will in specificity with the highest score being Logistic Regression (59.37%). Specificity measures how good the model is at finding true negatives.
Overall the trend from the basic accuracy results continues where Logistic Regression appears to be the superior model and kNN is the worst model. SVM does a very good job at detecting true positives but a mediocre job at classifying true negatves. LDA does pretty well on all measure but does not stand out in anything.