Tutorial Week 5

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(caret)

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

data(PimaIndiansDiabetes2, package = "mlbench")

colnames(PimaIndiansDiabetes2)

## [1] "pregnant" "glucose"  "pressure" "triceps"  "insulin"  "mass"     "pedigree"
## [8] "age"      "diabetes"

vapply(PimaIndiansDiabetes2, class, character(1))

##  pregnant   glucose  pressure   triceps   insulin      mass  pedigree       age 
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" 
##  diabetes 
##  "factor"

#Check for NA
vapply(PimaIndiansDiabetes2, anyNA, logical(1))

## pregnant  glucose pressure  triceps  insulin     mass pedigree      age 
##    FALSE     TRUE     TRUE     TRUE     TRUE     TRUE    FALSE    FALSE 
## diabetes 
##    FALSE

fresh_pima <- PimaIndiansDiabetes2 %>% drop_na

##2.1

logistic <- glm(diabetes ~ ., data = fresh_pima, family = binomial(link = "logit"))

summary(logistic)

## 
## Call:
## glm(formula = diabetes ~ ., family = binomial(link = "logit"), 
##     data = fresh_pima)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.004e+01  1.218e+00  -8.246  < 2e-16 ***
## pregnant     8.216e-02  5.543e-02   1.482  0.13825    
## glucose      3.827e-02  5.768e-03   6.635 3.24e-11 ***
## pressure    -1.420e-03  1.183e-02  -0.120  0.90446    
## triceps      1.122e-02  1.708e-02   0.657  0.51128    
## insulin     -8.253e-04  1.306e-03  -0.632  0.52757    
## mass         7.054e-02  2.734e-02   2.580  0.00989 ** 
## pedigree     1.141e+00  4.274e-01   2.669  0.00760 ** 
## age          3.395e-02  1.838e-02   1.847  0.06474 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 498.10  on 391  degrees of freedom
## Residual deviance: 344.02  on 383  degrees of freedom
## AIC: 362.02
## 
## Number of Fisher Scoring iterations: 5

#Get accuracy of logistic regression
logit_decision <- ifelse(logistic$fitted.values > 0.5, "pos", "neg")

logit_accuracy <- mean(logit_decision == fresh_pima$diabetes, na.rm = TRUE) * 100

logit_accuracy

## [1] 78.31633

#3.1 splitting data
set.seed(530306377)
train <- createDataPartition(fresh_pima$diabetes, p = 0.75)[[1]]

pima_train <- fresh_pima[train,]

pima_test <- fresh_pima[-train,]

#3.2

log_model <- train(diabetes~., data = pima_train,  method = "glm", family = binomial(link = "logit"), trControl = trainControl(method = "repeatedcv", repeats = 5))
log_model

## Generalized Linear Model 
## 
## 295 samples
##   8 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 265, 266, 265, 266, 266, 265, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.7836765  0.4882072

lda_model <- train(diabetes~., data = pima_train, method = "lda", trControl = trainControl(method = "repeatedcv", repeats = 5))
lda_model

## Linear Discriminant Analysis 
## 
## 295 samples
##   8 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 266, 266, 265, 265, 265, 266, ... 
## Resampling results:
## 
##   Accuracy  Kappa    
##   0.782069  0.4777004

knn_model <- train(diabetes~., data = pima_train, method = "knn", trControl = trainControl(method = "repeatedcv", repeats = 5))

knn_model

## k-Nearest Neighbors 
## 
## 295 samples
##   8 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 266, 266, 265, 266, 265, 265, ... 
## Resampling results across tuning parameters:
## 
##   k  Accuracy   Kappa    
##   5  0.7384023  0.3882928
##   7  0.7491839  0.4104230
##   9  0.7614384  0.4361467
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.

svm_model <- train(diabetes~., data = pima_train, method = "svmLinearWeights", trControl = trainControl(method = "repeatedcv", repeats = 5))

svm_model

## Linear Support Vector Machines with Class Weights 
## 
## 295 samples
##   8 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 266, 265, 265, 266, 265, 265, ... 
## Resampling results across tuning parameters:
## 
##   cost  weight  Accuracy   Kappa    
##   0.25  1       0.7760082  0.4665989
##   0.25  2       0.7708358  0.5029430
##   0.25  3       0.7559589  0.5048300
##   0.50  1       0.7800772  0.4763968
##   0.50  2       0.7701478  0.5006722
##   0.50  3       0.7566716  0.5052540
##   1.00  1       0.7821494  0.4816081
##   1.00  2       0.7694105  0.4997107
##   1.00  3       0.7539327  0.5001749
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were cost = 1 and weight = 1.

#3.3
log_pred <- predict(log_model, newdata = pima_test)

log_confusion <- confusionMatrix(log_pred, pima_test$diabetes)

log_confusion

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg  58  13
##        pos   7  19
##                                           
##                Accuracy : 0.7938          
##                  95% CI : (0.6997, 0.8693)
##     No Information Rate : 0.6701          
##     P-Value [Acc > NIR] : 0.005133        
##                                           
##                   Kappa : 0.5103          
##                                           
##  Mcnemar's Test P-Value : 0.263552        
##                                           
##             Sensitivity : 0.8923          
##             Specificity : 0.5938          
##          Pos Pred Value : 0.8169          
##          Neg Pred Value : 0.7308          
##              Prevalence : 0.6701          
##          Detection Rate : 0.5979          
##    Detection Prevalence : 0.7320          
##       Balanced Accuracy : 0.7430          
##                                           
##        'Positive' Class : neg             
##

knn_pred <- predict(knn_model, newdata = pima_test)

knnconfusion <- confusionMatrix(knn_pred, pima_test$diabetes)

all_models <- list(log_model, lda_model, knn_model, svm_model)

matrices <- lapply(all_models, function(model){
  pred <- predict(model, newdata= pima_test)
  confusionMatrix(pred, pima_test$diabetes)
})

names(matrices) <- c("Logistic", "LDA", "kNN", "SVM")

results <- matrix(NA, nrow = 7, ncol = length(matrices))

for (i in seq_along(matrices)){
  
  results[, i] <- matrices[[i]][["overall"]]
  
  
}

results_df <- as.data.frame(results)
colnames(results_df) <- c("Logistic", "LDA", "kNN", "SVM")
rownames(results_df) <- names(matrices[[1]][["overall"]])

library(knitr)
library(kableExtra)

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

table <- kable(results_df, caption = "Basic Accuracy Results for each Model")%>%
  kable_classic_2(c("striped", "hover"), html_font = "Cambria", full_width = F, position = "center")

table

Basic Accuracy Results for each Model
	Logistic	LDA	kNN	SVM
Accuracy	0.7938144	0.7731959	0.7422680	0.7628866
Kappa	0.5103483	0.4613831	0.3722495	0.4224696
AccuracyLower	0.6996838	0.6769959	0.6434684	0.6657558
AccuracyUpper	0.8692713	0.8520740	0.8257518	0.8433668
AccuracyNull	0.6701031	0.6701031	0.6701031	0.6701031
AccuracyPValue	0.0051329	0.0178257	0.0781344	0.0306572
McnemarPValue	0.2635525	0.2864220	0.1095986	0.0952928

Discussion

The accuracy for all 4 models are fairly similar ranging from kNN’s 74.22% –> Logistic Regression’s 79.38%. LDA performs slightly better than SVM but Logistic Regression is marginaly the most accurate model. Lets look at some other measures to get a better idea.

res <- matrix(NA, nrow = 11, ncol = length(matrices))

for (i in seq_along(matrices)){
  
  res[, i] <- matrices[[i]][["byClass"]]
  
  
}

res_df <- as.data.frame(res)
colnames(res_df) <- c("Logistic", "LDA", "kNN", "SVM")
rownames(res_df) <- names(matrices[[1]][["byClass"]])

table2 <- kable(res_df, caption = "Other Accuracy Results for each Model")%>%
  kable_classic_2(c("striped", "hover"), html_font = "Cambria", full_width = F, position = "center")

table2

Other Accuracy Results for each Model
	Logistic	LDA	kNN	SVM
Sensitivity	0.8923077	0.8769231	0.8769231	0.8923077
Specificity	0.5937500	0.5625000	0.4687500	0.5000000
Pos Pred Value	0.8169014	0.8028169	0.7702703	0.7837838
Neg Pred Value	0.7307692	0.6923077	0.6521739	0.6956522
Precision	0.8169014	0.8028169	0.7702703	0.7837838
Recall	0.8923077	0.8769231	0.8769231	0.8923077
F1	0.8529412	0.8382353	0.8201439	0.8345324
Prevalence	0.6701031	0.6701031	0.6701031	0.6701031
Detection Rate	0.5979381	0.5876289	0.5876289	0.5979381
Detection Prevalence	0.7319588	0.7319588	0.7628866	0.7628866
Balanced Accuracy	0.7430288	0.7197115	0.6728365	0.6961538

Discussion

Logistic Regression and SVM tie for the highest score in Sensitivity. Sensitivity is a measure of how well a model detects true positive instances.
All of our models do not do very will in specificity with the highest score being Logistic Regression (59.37%). Specificity measures how good the model is at finding true negatives.
Overall the trend from the basic accuracy results continues where Logistic Regression appears to be the superior model and kNN is the worst model. SVM does a very good job at detecting true positives but a mediocre job at classifying true negatves. LDA does pretty well on all measure but does not stand out in anything.

Tutorial Week 5

2023-08-30

Discussion

Discussion