diabetes <- read.csv('diabetes.csv')
head(diabetes)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
summary(diabetes)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
library(ggplot2)
# par(mfrow=c(3,3))
# Loop through each column
for (col in names(diabetes)) {
# Check if the column is numeric
hist(diabetes[[col]],
main=col,
xlab=col)
}
Notable things: - Glucose, BloodPressure, SkinThickness, BMI, and Insulin all have values of 0 or near zero that are not real. must figure out a way to deal with these. - there is no missing data. it does appear that the missing data has been imputed; the zeros were imputed in, whether intentionally or not. - essentially only BP and BMI are normal (once you take out the weird fake values). you need to do box cox or yeo johnson transformations on everything
All of them are continuous except the outcome variable
For preprocessing, things to do include: - check for multicolinearity - check for correlation - deal with missing/unrealistic ones: i expect to convert these into NaN and then impute them/remove them. - near zero var - check for outliers and normality problems (i expect fixing zeros will help in this) - center and scaling
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(corrplot)
## corrplot 0.94 loaded
corrs <- cor(diabetes[,-9])
corrplot(corrs)
If we look, the highest correlation is age and pregnancies which makes sense. The correlation is at 0.5, so we do not need to remove it. Now for multicolinearity.
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Fit a logistic regression model using GLM
glm_model <- glm(Outcome ~ ., data = diabetes, family = binomial)
# Calculate VIF for the GLM model
vif_values <- vif(glm_model)
# Print VIF values
print(vif_values)
## Pregnancies Glucose BloodPressure
## 1.408434 1.214367 1.175283
## SkinThickness Insulin BMI
## 1.522040 1.467918 1.220416
## DiabetesPedigreeFunction Age
## 1.034318 1.502069
None of the VIF are higher than 10, no issues with multicolinearity.
Glucose, BloodPressure, SkinThickness, BMI all have values of 0 which are impossible. They are not outliers, but rather incorrectly inputted information. I believe these should be treated as missing values. Let us change them to missing values and then repeat MICE, to see what we should do for imputation.
incorrect.cols <- c('Glucose','BloodPressure','SkinThickness','Insulin','BMI') # has the messed up columns
diabetes.corrected <- diabetes
# Loop through each column in the vector
for (col in incorrect.cols) {
# Replace 0 with NaN
diabetes.corrected[[col]][diabetes.corrected[[col]] == 0] <- NaN
}
# Print the updated dataframe to verify changes
summary(diabetes.corrected)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 44.0 Min. : 24.00 Min. : 7.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 64.00 1st Qu.:22.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :29.00
## Mean : 3.845 Mean :121.7 Mean : 72.41 Mean :29.15
## 3rd Qu.: 6.000 3rd Qu.:141.0 3rd Qu.: 80.00 3rd Qu.:36.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## NA's :5 NA's :35 NA's :227
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 14.00 Min. :18.20 Min. :0.0780 Min. :21.00
## 1st Qu.: 76.25 1st Qu.:27.50 1st Qu.:0.2437 1st Qu.:24.00
## Median :125.00 Median :32.30 Median :0.3725 Median :29.00
## Mean :155.55 Mean :32.46 Mean :0.4719 Mean :33.24
## 3rd Qu.:190.00 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.00 Max. :67.10 Max. :2.4200 Max. :81.00
## NA's :374 NA's :11
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
##
Now using MICE:
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked _by_ '.GlobalEnv':
##
## diabetes
## The following object is masked from 'package:datasets':
##
## sleep
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
# Summary of the missing data pattern
md_pattern <- md.pattern(diabetes.corrected)
print(md_pattern)
## Pregnancies DiabetesPedigreeFunction Age Outcome Glucose BMI BloodPressure
## 392 1 1 1 1 1 1 1
## 140 1 1 1 1 1 1 1
## 192 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 0
## 26 1 1 1 1 1 1 0
## 1 1 1 1 1 1 0 1
## 1 1 1 1 1 1 0 1
## 2 1 1 1 1 1 0 1
## 7 1 1 1 1 1 0 0
## 1 1 1 1 1 0 1 1
## 4 1 1 1 1 0 1 1
## 0 0 0 0 5 11 35
## SkinThickness Insulin
## 392 1 1 0
## 140 1 0 1
## 192 0 0 2
## 2 1 0 2
## 26 0 0 3
## 1 1 1 1
## 1 1 0 2
## 2 0 0 3
## 7 0 0 4
## 1 1 1 1
## 4 1 0 2
## 227 374 652
aggr_plot <- aggr(diabetes.corrected, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(diabetes.corrected), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## Insulin 0.486979167
## SkinThickness 0.295572917
## BloodPressure 0.045572917
## BMI 0.014322917
## Glucose 0.006510417
## Pregnancies 0.000000000
## DiabetesPedigreeFunction 0.000000000
## Age 0.000000000
## Outcome 0.000000000
Let’s look at a few of these that are missing and see if there is something about them: (this doesnt have to be kept)
# Check if NaN values exist (in R, NaN can be checked with is.nan)
rows_with_nan <- is.nan(diabetes.corrected$SkinThickness)
# Create a table of the 'Outcome' column for those rows
outcome_table <- table(diabetes.corrected$Outcome[rows_with_nan])
# Print the table
print(outcome_table)
##
## 0 1
## 139 88
We will use MICE to generate 5 imputed datasets. This can help deal with variation of the missing data. For now we will only use the first one.
diabetes.imputed <- mice(diabetes.corrected, m=5, method='pmm') # generates 5 imputed datasets
##
## iter imp variable
## 1 1 Glucose BloodPressure SkinThickness Insulin BMI
## 1 2 Glucose BloodPressure SkinThickness Insulin BMI
## 1 3 Glucose BloodPressure SkinThickness Insulin BMI
## 1 4 Glucose BloodPressure SkinThickness Insulin BMI
## 1 5 Glucose BloodPressure SkinThickness Insulin BMI
## 2 1 Glucose BloodPressure SkinThickness Insulin BMI
## 2 2 Glucose BloodPressure SkinThickness Insulin BMI
## 2 3 Glucose BloodPressure SkinThickness Insulin BMI
## 2 4 Glucose BloodPressure SkinThickness Insulin BMI
## 2 5 Glucose BloodPressure SkinThickness Insulin BMI
## 3 1 Glucose BloodPressure SkinThickness Insulin BMI
## 3 2 Glucose BloodPressure SkinThickness Insulin BMI
## 3 3 Glucose BloodPressure SkinThickness Insulin BMI
## 3 4 Glucose BloodPressure SkinThickness Insulin BMI
## 3 5 Glucose BloodPressure SkinThickness Insulin BMI
## 4 1 Glucose BloodPressure SkinThickness Insulin BMI
## 4 2 Glucose BloodPressure SkinThickness Insulin BMI
## 4 3 Glucose BloodPressure SkinThickness Insulin BMI
## 4 4 Glucose BloodPressure SkinThickness Insulin BMI
## 4 5 Glucose BloodPressure SkinThickness Insulin BMI
## 5 1 Glucose BloodPressure SkinThickness Insulin BMI
## 5 2 Glucose BloodPressure SkinThickness Insulin BMI
## 5 3 Glucose BloodPressure SkinThickness Insulin BMI
## 5 4 Glucose BloodPressure SkinThickness Insulin BMI
## 5 5 Glucose BloodPressure SkinThickness Insulin BMI
# summary(diabetes.imputed)
diabetes.preprocessed <- complete(diabetes.imputed, 1) # returns first imputed dataset
Now let’s look at the histograms again:
par(mfrow=c(3,3))
# Loop through each column
for (col in names(diabetes.preprocessed)) {
# Check if the column is numeric
hist(diabetes.preprocessed[[col]],
main=col,
xlab=col)
}
pairs(diabetes.preprocessed)
Note, we also see some new correlation between BMI and skin thickness. I checked and it is 0.63 ish. High but not an issue. There may be randomness with MICE so this may be something to revisit later on, but I don’t think it will be an issue.
library(caret)
## Loading required package: lattice
degenerate <- nearZeroVar(diabetes.preprocessed)
There are no degenerate variables. We can move on.
This one is a bit trickier. Will look at later.
model <- glm(Outcome ~ ., data = diabetes.preprocessed, family = binomial)
# Calculate Cook's Distance
cooks <- cooks.distance(model)
# Plot Cook's Distance
plot(cooks, type = "h", col = "blue", lwd = 2, ylab = "Cook's Distance", main = "Cook's Distance")
# Add a horizontal line for the threshold (typically 4/(n-k-1))
# abline(h = 4/(nrow(diabetes.preprocessed) - length(model$coefficients)), col = "red", lty = 2)
abline(h = 4/(nrow(diabetes.preprocessed)), col = "red", lty = 2)
# Identify potentially influential points
# influential_points <- cooks[cooks > 4/(nrow(diabetes.preprocessed) - length(model$coefficients))]
influential_points <- cooks[cooks > 4/(nrow(diabetes.preprocessed))]
length(influential_points)
## [1] 49
50 influential points, about 1/16 of the data is influential points. That is kind of significant. Let’s see what kind of influential points these are:
influential.indices <- which(cooks > 4/(nrow(diabetes.preprocessed)))
diabetes.influential <- diabetes.preprocessed[influential.indices,]
summary(diabetes.influential)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 78.0 Min. : 30.00 Min. : 7.00
## 1st Qu.: 1.000 1st Qu.:107.0 1st Qu.: 72.00 1st Qu.:24.00
## Median : 3.000 Median :134.0 Median : 80.00 Median :31.00
## Mean : 4.306 Mean :136.4 Mean : 78.67 Mean :30.45
## 3rd Qu.: 7.000 3rd Qu.:164.0 3rd Qu.: 85.00 3rd Qu.:39.00
## Max. :13.000 Max. :197.0 Max. :110.00 Max. :52.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 29.0 Min. :22.90 Min. :0.1260 Min. :22.00
## 1st Qu.: 94.0 1st Qu.:29.00 1st Qu.:0.2480 1st Qu.:26.00
## Median :180.0 Median :34.00 Median :0.4600 Median :39.00
## Mean :259.8 Mean :35.49 Mean :0.6191 Mean :40.37
## 3rd Qu.:310.0 3rd Qu.:40.50 3rd Qu.:0.8550 3rd Qu.:51.00
## Max. :846.0 Max. :57.30 Max. :2.3290 Max. :81.00
## Outcome
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.4286
## 3rd Qu.:1.0000
## Max. :1.0000
For right now, they seem a little problematic. Let’s remove them all; this may not be the best strategy so lets return to it later.
diabetes.no.influential <- diabetes.preprocessed[-influential.indices,]
#diabetes.no.influential <- diabetes.preprocessed
# recalculate cooks distance
model1 <- glm(Outcome ~ ., data = diabetes.no.influential, family = binomial)
# Calculate Cook's Distance
cooks1 <- cooks.distance(model1)
# Plot Cook's Distance
plot(cooks1, type = "h", col = "blue", lwd = 2, ylab = "Cook's Distance", main = "Cook's Distance")
# Add a horizontal line for the threshold (typically 4/(n-k-1))
# abline(h = 4/(nrow(diabetes.preprocessed) - length(model$coefficients)), col = "red", lty = 2)
abline(h = 4/(nrow(diabetes.no.influential)), col = "red", lty = 2)
influential_points <- cooks[cooks > 4/(nrow(diabetes.no.influential))]
length(influential_points)
## [1] 42
Some outliers are still present, but the magnitude is much lower. We can leave them in for now and just keep it in the back of our minds. If necessary we can make changes later.
plot(model1)
I do not know if this is normal or not. It looks off to me but I am not sure. Let’s revisit later and assume everything is ok.
diabetes.complete <- as.data.frame(scale(diabetes.no.influential[,-9]))
diabetes.complete$Outcome <- as.factor(diabetes.no.influential$Outcome)
par(mfrow=c(3,3))
# Loop through each column
for (col in names(diabetes.complete[,-9])) {
# Check if the column is numeric
hist(diabetes.complete[[col]],
main=col,
xlab=col)
}
Things are looking good. We can move on to predictions and see what we need to fix.
We can do 10 fold validation using logistic regression to start with. The data is slightly imbalanced but it is around 2 negatives for 1 positive, so we don’t need to worry too much about SMOTE or any other types of oversampling for the positive class.
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:colorspace':
##
## coords
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
k = 10
fold.indices <- createDataPartition(diabetes.complete$Outcome, times = k, p = 1/k, list = TRUE) # list of 10 lists of indices for testing
Sanity check on properly balanced data:
par(mfrow = c(2,5))
for (fold in fold.indices){
print('test')
barplot(table(diabetes[fold,9]))
print('train')
barplot(table(diabetes[-fold,9]))
}
## [1] "test"
## [1] "train"
## [1] "test"
## [1] "train"
## [1] "test"
## [1] "train"
## [1] "test"
## [1] "train"
## [1] "test"
## [1] "train"
## [1] "test"
## [1] "train"
## [1] "test"
## [1] "train"
## [1] "test"
## [1] "train"
## [1] "test"
## [1] "train"
## [1] "test"
## [1] "train"
The balancing looks fine. We can continue to training.
stepwise.logistic <- function(test.index) {
train <- diabetes.complete[-test.index,]
test <- diabetes.complete[test.index,]
full.model <- glm(Outcome ~ ., data=train, family = binomial)
null.model <- glm(Outcome ~ 1, data=train, family = binomial)
stepwise <- stepAIC(null.model, scope = list(lower = null.model, upper = full.model), direction = 'both', trace=0)
y.prob <- predict(stepwise, newdata = test, type='response')
y.pred <- ifelse(y.prob > 0.5, '1', '0')
y.pred <- factor(y.pred, levels = c(0, 1))
test$Outcome <- factor(test$Outcome, levels = c(0, 1))
cm <- confusionMatrix(y.pred, reference = test$Outcome, positive = '1', mode='everything')
roc.obj <- roc(test$Outcome, y.prob)
return(list(cm = cm, roc.obj = roc.obj))
}
metrics <- lapply(fold.indices, stepwise.logistic)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
Going through the confusion matrix and AUROC:
auc.list <- list()
# Print confusion matrix and auroc
for (i in seq_along(metrics)) {
cat("=========== Fold", i, "===========\n")
x <- metrics[[i]]
cm <- x[[1]]
roc.obj <- x[[2]]
print(cm)
auc <- auc(roc.obj)
print(auc)
auc.list <- append(auc.list, as.numeric(auc))
cat("\n")
}
## =========== Fold 1 ===========
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 42 7
## 1 6 18
##
## Accuracy : 0.8219
## 95% CI : (0.7147, 0.9016)
## No Information Rate : 0.6575
## P-Value [Acc > NIR] : 0.001492
##
## Kappa : 0.6008
##
## Mcnemar's Test P-Value : 1.000000
##
## Sensitivity : 0.7200
## Specificity : 0.8750
## Pos Pred Value : 0.7500
## Neg Pred Value : 0.8571
## Precision : 0.7500
## Recall : 0.7200
## F1 : 0.7347
## Prevalence : 0.3425
## Detection Rate : 0.2466
## Detection Prevalence : 0.3288
## Balanced Accuracy : 0.7975
##
## 'Positive' Class : 1
##
## Area under the curve: 0.9225
##
## =========== Fold 2 ===========
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 41 7
## 1 7 18
##
## Accuracy : 0.8082
## 95% CI : (0.6992, 0.891)
## No Information Rate : 0.6575
## P-Value [Acc > NIR] : 0.003533
##
## Kappa : 0.5742
##
## Mcnemar's Test P-Value : 1.000000
##
## Sensitivity : 0.7200
## Specificity : 0.8542
## Pos Pred Value : 0.7200
## Neg Pred Value : 0.8542
## Precision : 0.7200
## Recall : 0.7200
## F1 : 0.7200
## Prevalence : 0.3425
## Detection Rate : 0.2466
## Detection Prevalence : 0.3425
## Balanced Accuracy : 0.7871
##
## 'Positive' Class : 1
##
## Area under the curve: 0.9
##
## =========== Fold 3 ===========
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 41 6
## 1 7 19
##
## Accuracy : 0.8219
## 95% CI : (0.7147, 0.9016)
## No Information Rate : 0.6575
## P-Value [Acc > NIR] : 0.001492
##
## Kappa : 0.6083
##
## Mcnemar's Test P-Value : 1.000000
##
## Sensitivity : 0.7600
## Specificity : 0.8542
## Pos Pred Value : 0.7308
## Neg Pred Value : 0.8723
## Precision : 0.7308
## Recall : 0.7600
## F1 : 0.7451
## Prevalence : 0.3425
## Detection Rate : 0.2603
## Detection Prevalence : 0.3562
## Balanced Accuracy : 0.8071
##
## 'Positive' Class : 1
##
## Area under the curve: 0.8975
##
## =========== Fold 4 ===========
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 37 6
## 1 11 19
##
## Accuracy : 0.7671
## 95% CI : (0.6535, 0.8581)
## No Information Rate : 0.6575
## P-Value [Acc > NIR] : 0.02939
##
## Kappa : 0.5066
##
## Mcnemar's Test P-Value : 0.33198
##
## Sensitivity : 0.7600
## Specificity : 0.7708
## Pos Pred Value : 0.6333
## Neg Pred Value : 0.8605
## Precision : 0.6333
## Recall : 0.7600
## F1 : 0.6909
## Prevalence : 0.3425
## Detection Rate : 0.2603
## Detection Prevalence : 0.4110
## Balanced Accuracy : 0.7654
##
## 'Positive' Class : 1
##
## Area under the curve: 0.8908
##
## =========== Fold 5 ===========
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 44 9
## 1 4 16
##
## Accuracy : 0.8219
## 95% CI : (0.7147, 0.9016)
## No Information Rate : 0.6575
## P-Value [Acc > NIR] : 0.001492
##
## Kappa : 0.5847
##
## Mcnemar's Test P-Value : 0.267257
##
## Sensitivity : 0.6400
## Specificity : 0.9167
## Pos Pred Value : 0.8000
## Neg Pred Value : 0.8302
## Precision : 0.8000
## Recall : 0.6400
## F1 : 0.7111
## Prevalence : 0.3425
## Detection Rate : 0.2192
## Detection Prevalence : 0.2740
## Balanced Accuracy : 0.7783
##
## 'Positive' Class : 1
##
## Area under the curve: 0.9275
##
## =========== Fold 6 ===========
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 45 12
## 1 3 13
##
## Accuracy : 0.7945
## 95% CI : (0.6838, 0.8802)
## No Information Rate : 0.6575
## P-Value [Acc > NIR] : 0.007713
##
## Kappa : 0.5007
##
## Mcnemar's Test P-Value : 0.038867
##
## Sensitivity : 0.5200
## Specificity : 0.9375
## Pos Pred Value : 0.8125
## Neg Pred Value : 0.7895
## Precision : 0.8125
## Recall : 0.5200
## F1 : 0.6341
## Prevalence : 0.3425
## Detection Rate : 0.1781
## Detection Prevalence : 0.2192
## Balanced Accuracy : 0.7288
##
## 'Positive' Class : 1
##
## Area under the curve: 0.8675
##
## =========== Fold 7 ===========
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 44 11
## 1 4 14
##
## Accuracy : 0.7945
## 95% CI : (0.6838, 0.8802)
## No Information Rate : 0.6575
## P-Value [Acc > NIR] : 0.007713
##
## Kappa : 0.5109
##
## Mcnemar's Test P-Value : 0.121335
##
## Sensitivity : 0.5600
## Specificity : 0.9167
## Pos Pred Value : 0.7778
## Neg Pred Value : 0.8000
## Precision : 0.7778
## Recall : 0.5600
## F1 : 0.6512
## Prevalence : 0.3425
## Detection Rate : 0.1918
## Detection Prevalence : 0.2466
## Balanced Accuracy : 0.7383
##
## 'Positive' Class : 1
##
## Area under the curve: 0.8875
##
## =========== Fold 8 ===========
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 43 10
## 1 5 15
##
## Accuracy : 0.7945
## 95% CI : (0.6838, 0.8802)
## No Information Rate : 0.6575
## P-Value [Acc > NIR] : 0.007713
##
## Kappa : 0.5208
##
## Mcnemar's Test P-Value : 0.301700
##
## Sensitivity : 0.6000
## Specificity : 0.8958
## Pos Pred Value : 0.7500
## Neg Pred Value : 0.8113
## Precision : 0.7500
## Recall : 0.6000
## F1 : 0.6667
## Prevalence : 0.3425
## Detection Rate : 0.2055
## Detection Prevalence : 0.2740
## Balanced Accuracy : 0.7479
##
## 'Positive' Class : 1
##
## Area under the curve: 0.8525
##
## =========== Fold 9 ===========
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 41 8
## 1 7 17
##
## Accuracy : 0.7945
## 95% CI : (0.6838, 0.8802)
## No Information Rate : 0.6575
## P-Value [Acc > NIR] : 0.007713
##
## Kappa : 0.5393
##
## Mcnemar's Test P-Value : 1.000000
##
## Sensitivity : 0.6800
## Specificity : 0.8542
## Pos Pred Value : 0.7083
## Neg Pred Value : 0.8367
## Precision : 0.7083
## Recall : 0.6800
## F1 : 0.6939
## Prevalence : 0.3425
## Detection Rate : 0.2329
## Detection Prevalence : 0.3288
## Balanced Accuracy : 0.7671
##
## 'Positive' Class : 1
##
## Area under the curve: 0.8742
##
## =========== Fold 10 ===========
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 42 6
## 1 6 19
##
## Accuracy : 0.8356
## 95% CI : (0.7305, 0.9121)
## No Information Rate : 0.6575
## P-Value [Acc > NIR] : 0.0005784
##
## Kappa : 0.635
##
## Mcnemar's Test P-Value : 1.0000000
##
## Sensitivity : 0.7600
## Specificity : 0.8750
## Pos Pred Value : 0.7600
## Neg Pred Value : 0.8750
## Precision : 0.7600
## Recall : 0.7600
## F1 : 0.7600
## Prevalence : 0.3425
## Detection Rate : 0.2603
## Detection Prevalence : 0.3425
## Balanced Accuracy : 0.8175
##
## 'Positive' Class : 1
##
## Area under the curve: 0.8925
The performance is surprisingly pretty good. A range in the 0.8 region is supposed to be very good performance. Here is the average AUROC:
plot(roc.obj)
print(mean(unlist(auc.list)))
## [1] 0.89125
So the average is 0.86, which is pretty good.
Next step would be to do other models. Probably pretty similar to LDA so I don’t think we would really need to do that. Perhaps QDA? This is probably way too small for NB so we can probably skip that. I would also say a KNN classifier just to see, and then either/both ridge and lasso. There is no multicolinearity issues here so lasso would probably be a good choice.
We would also need nice plots, so we can mess around with ggplot2 to compare whatever models we choose in the end.
set.seed(123)
# Scale the entire dataset
scaled_data <- scale(diabetes.complete[, -which(names(diabetes.complete) == "Outcome")])
target <- as.factor(diabetes.complete$Outcome) # Ensure target is a factor for classification
# Cross-validation to find the best k
train_control <- trainControl(method = "cv", number = 10) # 10-fold CV for tuning
knn.tuned <- train(
x = scaled_data,
y = target,
method = "knn",
trControl = train_control,
tuneGrid = data.frame(k = seq(1, 20, by = 1))
)
plot(knn.tuned)
# Extract the best k
best.k <- knn.tuned$bestTune$k
cat("Best k determined from CV:", best.k, "\n")
## Best k determined from CV: 18
# Load required libraries
library(caret)
library(pROC)
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.2
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
library(class)
library(MASS)
library(pls)
## Warning: package 'pls' was built under R version 4.4.2
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:corrplot':
##
## corrplot
## The following object is masked from 'package:stats':
##
## loadings
# Initialize lists to store metrics for each model
#cm.list <- list()
roc.list <- list()
#auc.list <- list()
# Splitting the dataset
set.seed(123) # For reproducibility
fold.indices <- createFolds(diabetes.complete$Outcome, k = 10, list = TRUE)
# Function to train and evaluate models
evaluate_model <- function(model_name, train, test, y.train, y.test) {
if (model_name == "logistic") {
logistic.model <- glm(Outcome ~ ., data=data.frame(train, Outcome=y.train), family=binomial)
print(summary(logistic.model))
y.prob <- predict(logistic.model, newdata=data.frame(test), type="response")
}
if (model_name == "parameter_selection") {
logistic.full <- glm(Outcome ~ ., data=data.frame(train, Outcome=y.train), family=binomial)
logistic.null <- glm(Outcome ~ 1, data=data.frame(train, Outcome=y.train), family=binomial)
logistic.model <- stepAIC(logistic.null, scope=list(lower=logistic.null, upper=logistic.full), direction='both', trace=0)
print(summary(logistic.model))
y.prob <- predict(logistic.model, newdata=data.frame(test), type="response")
} else if (model_name == "qda") {
qda.model <- qda(x = train, grouping = y.train)
print(qda.model)
y.prob <- predict(qda.model, newdata = test)$posterior[, 2]
} else if (model_name == "knn") {
train.scaled <- scale(train)
test.scaled <- scale(test, center = attr(train.scaled, "scaled:center"), scale = attr(train.scaled, "scaled:scale"))
y.prob <- as.numeric(knn(train = train.scaled, test = test.scaled, cl = y.train, k = 10)) - 1
} else if (model_name == "rf") {
rf.model <- randomForest(as.factor(y.train) ~ ., data = data.frame(train, y.train = y.train))
print(importance(rf.model))
y.prob <- predict(rf.model, newdata = data.frame(test), type = "prob")[, 2]
}
y.pred <- ifelse(y.prob > 0.5, '1', '0')
y.pred <- factor(y.pred, levels=c(0,1))
y.test <- factor(y.test, levels=c(0,1))
cm <- confusionMatrix(y.pred, reference=y.test, positive='1', mode='everything')
#print(cm)
roc.obj <- roc(y.test, y.prob)
auc.val <- auc(roc.obj)
return(list(cm = cm, roc.obj = roc.obj, auc.val = auc.val))
}
# Initialize an empty data frame for overall metrics
overall_metrics <- data.frame()
# Loop through models and folds
models <- c("logistic", "parameter_selection", "qda", "knn", "rf")
for (model_name in models) {
cat("Evaluating:", model_name, "\n")
# Initialize storage for fold-specific metrics
fold_metrics <- data.frame(
Accuracy = numeric(),
Sensitivity = numeric(),
Specificity = numeric(),
PosPredValue = numeric(),
NegPredValue = numeric(),
Precision = numeric(),
Recall = numeric(),
F1 = numeric(),
AUC = numeric()
)
model.roc.list <- list()
for (i in seq_along(fold.indices)) {
# Train-test split
test.indices <- fold.indices[[i]]
train.indices <- setdiff(seq_len(nrow(diabetes.complete)), test.indices)
train <- diabetes.complete[train.indices, -which(names(diabetes.complete) == "Outcome")]
test <- diabetes.complete[test.indices, -which(names(diabetes.complete) == "Outcome")]
y.train <- diabetes.complete$Outcome[train.indices]
y.test <- diabetes.complete$Outcome[test.indices]
# Evaluate model
metrics <- evaluate_model(model_name, train, test, y.train, y.test)
model.roc.list[[i]] <- metrics$roc.obj
# Extract metrics
cm <- metrics$cm # Confusion matrix
auc <- metrics$auc.val # AUC
# Store fold metrics
fold_metrics <- rbind(fold_metrics, data.frame(
Accuracy = cm$overall["Accuracy"],
Sensitivity = cm$byClass["Sensitivity"],
Specificity = cm$byClass["Specificity"],
PosPredValue = cm$byClass["Pos Pred Value"],
NegPredValue = cm$byClass["Neg Pred Value"],
Precision = cm$byClass["Precision"],
Recall = cm$byClass["Recall"],
F1 = cm$byClass["F1"],
AUC = auc
))
}
roc.list[[model_name]] <- model.roc.list
# Calculate average metrics for the model
avg_metrics <- colMeans(fold_metrics, na.rm = TRUE)
avg_metrics <- data.frame(Model = model_name, t(avg_metrics)) # Ensure Model column exists
# Append to overall results
overall_metrics <- rbind(overall_metrics, avg_metrics)
}
## Evaluating: logistic
##
## Call:
## glm(formula = Outcome ~ ., family = binomial, data = data.frame(train,
## Outcome = y.train))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.0873678 0.1279904 -8.496 < 2e-16 ***
## Pregnancies 0.7387164 0.1478488 4.996 5.84e-07 ***
## Glucose 1.7148882 0.1866627 9.187 < 2e-16 ***
## BloodPressure -0.1032007 0.1365426 -0.756 0.449761
## SkinThickness -0.0455291 0.1583236 -0.288 0.773676
## Insulin 0.1243738 0.1587223 0.784 0.433278
## BMI 0.9783828 0.1812579 5.398 6.75e-08 ***
## DiabetesPedigreeFunction 0.4605264 0.1249877 3.685 0.000229 ***
## Age 0.0005451 0.1475385 0.004 0.997052
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 834.29 on 647 degrees of freedom
## Residual deviance: 458.52 on 639 degrees of freedom
## AIC: 476.52
##
## Number of Fisher Scoring iterations: 6
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## glm(formula = Outcome ~ ., family = binomial, data = data.frame(train,
## Outcome = y.train))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.15395 0.12987 -8.885 < 2e-16 ***
## Pregnancies 0.65659 0.14759 4.449 8.64e-06 ***
## Glucose 1.74386 0.18706 9.322 < 2e-16 ***
## BloodPressure -0.16038 0.13492 -1.189 0.234542
## SkinThickness -0.14167 0.15692 -0.903 0.366612
## Insulin 0.09651 0.15294 0.631 0.528013
## BMI 1.07268 0.18120 5.920 3.22e-09 ***
## DiabetesPedigreeFunction 0.44564 0.12215 3.648 0.000264 ***
## Age 0.15278 0.14609 1.046 0.295661
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 832.15 on 646 degrees of freedom
## Residual deviance: 457.84 on 638 degrees of freedom
## AIC: 475.84
##
## Number of Fisher Scoring iterations: 6
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## glm(formula = Outcome ~ ., family = binomial, data = data.frame(train,
## Outcome = y.train))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.11770 0.12859 -8.692 < 2e-16 ***
## Pregnancies 0.65861 0.14500 4.542 5.57e-06 ***
## Glucose 1.76649 0.18745 9.424 < 2e-16 ***
## BloodPressure -0.12192 0.13531 -0.901 0.368
## SkinThickness -0.02204 0.16081 -0.137 0.891
## Insulin 0.06139 0.15407 0.398 0.690
## BMI 1.00287 0.18351 5.465 4.63e-08 ***
## DiabetesPedigreeFunction 0.53110 0.12509 4.246 2.18e-05 ***
## Age 0.07178 0.14461 0.496 0.620
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 831.31 on 645 degrees of freedom
## Residual deviance: 459.63 on 637 degrees of freedom
## AIC: 477.63
##
## Number of Fisher Scoring iterations: 6
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## glm(formula = Outcome ~ ., family = binomial, data = data.frame(train,
## Outcome = y.train))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.15372 0.13058 -8.835 < 2e-16 ***
## Pregnancies 0.73463 0.14553 5.048 4.47e-07 ***
## Glucose 1.74125 0.18513 9.405 < 2e-16 ***
## BloodPressure -0.06419 0.13882 -0.462 0.644
## SkinThickness -0.15971 0.15924 -1.003 0.316
## Insulin 0.08469 0.15200 0.557 0.577
## BMI 1.05355 0.18156 5.803 6.52e-09 ***
## DiabetesPedigreeFunction 0.50237 0.12517 4.014 5.98e-05 ***
## Age 0.09566 0.14300 0.669 0.504
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 832.15 on 646 degrees of freedom
## Residual deviance: 457.18 on 638 degrees of freedom
## AIC: 475.18
##
## Number of Fisher Scoring iterations: 6
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## glm(formula = Outcome ~ ., family = binomial, data = data.frame(train,
## Outcome = y.train))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.09377 0.12846 -8.515 < 2e-16 ***
## Pregnancies 0.67005 0.14536 4.610 4.04e-06 ***
## Glucose 1.71798 0.18375 9.350 < 2e-16 ***
## BloodPressure -0.13549 0.13573 -0.998 0.318148
## SkinThickness -0.09931 0.15594 -0.637 0.524222
## Insulin 0.15729 0.15919 0.988 0.323101
## BMI 1.06912 0.18150 5.890 3.85e-09 ***
## DiabetesPedigreeFunction 0.47343 0.12495 3.789 0.000151 ***
## Age 0.10895 0.14636 0.744 0.456609
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 832.15 on 646 degrees of freedom
## Residual deviance: 459.87 on 638 degrees of freedom
## AIC: 477.87
##
## Number of Fisher Scoring iterations: 6
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## glm(formula = Outcome ~ ., family = binomial, data = data.frame(train,
## Outcome = y.train))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.09575 0.12908 -8.489 < 2e-16 ***
## Pregnancies 0.70953 0.14669 4.837 1.32e-06 ***
## Glucose 1.72571 0.18835 9.162 < 2e-16 ***
## BloodPressure -0.17815 0.13545 -1.315 0.188
## SkinThickness -0.13223 0.15935 -0.830 0.407
## Insulin 0.09732 0.15700 0.620 0.535
## BMI 1.13259 0.18682 6.062 1.34e-09 ***
## DiabetesPedigreeFunction 0.50342 0.12720 3.958 7.57e-05 ***
## Age 0.17898 0.14763 1.212 0.225
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 832.15 on 646 degrees of freedom
## Residual deviance: 452.78 on 638 degrees of freedom
## AIC: 470.78
##
## Number of Fisher Scoring iterations: 6
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## glm(formula = Outcome ~ ., family = binomial, data = data.frame(train,
## Outcome = y.train))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.0532 0.1264 -8.335 < 2e-16 ***
## Pregnancies 0.7188 0.1456 4.936 7.99e-07 ***
## Glucose 1.7569 0.1896 9.266 < 2e-16 ***
## BloodPressure -0.1958 0.1362 -1.438 0.151
## SkinThickness -0.1649 0.1562 -1.056 0.291
## Insulin 0.1055 0.1541 0.684 0.494
## BMI 1.0763 0.1816 5.925 3.12e-09 ***
## DiabetesPedigreeFunction 0.5461 0.1262 4.326 1.52e-05 ***
## Age 0.1186 0.1452 0.817 0.414
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 834.29 on 647 degrees of freedom
## Residual deviance: 462.32 on 639 degrees of freedom
## AIC: 480.32
##
## Number of Fisher Scoring iterations: 6
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## glm(formula = Outcome ~ ., family = binomial, data = data.frame(train,
## Outcome = y.train))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.07589 0.12834 -8.383 < 2e-16 ***
## Pregnancies 0.64176 0.14955 4.291 1.78e-05 ***
## Glucose 1.70852 0.18752 9.111 < 2e-16 ***
## BloodPressure -0.20245 0.13762 -1.471 0.141
## SkinThickness -0.05944 0.15920 -0.373 0.709
## Insulin 0.26669 0.16404 1.626 0.104
## BMI 1.01633 0.18251 5.569 2.57e-08 ***
## DiabetesPedigreeFunction 0.53472 0.12488 4.282 1.85e-05 ***
## Age 0.06895 0.14777 0.467 0.641
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 832.15 on 646 degrees of freedom
## Residual deviance: 457.16 on 638 degrees of freedom
## AIC: 475.16
##
## Number of Fisher Scoring iterations: 6
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## glm(formula = Outcome ~ ., family = binomial, data = data.frame(train,
## Outcome = y.train))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.06712 0.12642 -8.441 < 2e-16 ***
## Pregnancies 0.64169 0.15049 4.264 2.01e-05 ***
## Glucose 1.71860 0.18204 9.441 < 2e-16 ***
## BloodPressure -0.14345 0.13769 -1.042 0.297511
## SkinThickness -0.01212 0.15907 -0.076 0.939258
## Insulin 0.16057 0.15522 1.034 0.300920
## BMI 1.02190 0.18286 5.589 2.29e-08 ***
## DiabetesPedigreeFunction 0.43717 0.12771 3.423 0.000619 ***
## Age -0.01411 0.15242 -0.093 0.926234
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 834.29 on 647 degrees of freedom
## Residual deviance: 459.16 on 639 degrees of freedom
## AIC: 477.16
##
## Number of Fisher Scoring iterations: 6
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## glm(formula = Outcome ~ ., family = binomial, data = data.frame(train,
## Outcome = y.train))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.10479 0.12792 -8.636 < 2e-16 ***
## Pregnancies 0.70407 0.14690 4.793 1.64e-06 ***
## Glucose 1.73822 0.18731 9.280 < 2e-16 ***
## BloodPressure -0.10679 0.13557 -0.788 0.430865
## SkinThickness -0.06631 0.15779 -0.420 0.674305
## Insulin 0.06002 0.15674 0.383 0.701759
## BMI 1.06946 0.18210 5.873 4.28e-09 ***
## DiabetesPedigreeFunction 0.43853 0.12358 3.548 0.000387 ***
## Age 0.07343 0.14748 0.498 0.618537
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 831.31 on 645 degrees of freedom
## Residual deviance: 461.26 on 637 degrees of freedom
## AIC: 479.26
##
## Number of Fisher Scoring iterations: 6
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Evaluating: parameter_selection
##
## Call:
## glm(formula = Outcome ~ Glucose + BMI + Pregnancies + DiabetesPedigreeFunction,
## family = binomial, data = data.frame(train, Outcome = y.train))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.0880 0.1271 -8.559 < 2e-16 ***
## Glucose 1.7716 0.1612 10.990 < 2e-16 ***
## BMI 0.9331 0.1382 6.751 1.47e-11 ***
## Pregnancies 0.7099 0.1211 5.864 4.51e-09 ***
## DiabetesPedigreeFunction 0.4609 0.1242 3.712 0.000205 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 834.29 on 647 degrees of freedom
## Residual deviance: 459.83 on 643 degrees of freedom
## AIC: 469.83
##
## Number of Fisher Scoring iterations: 6
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## glm(formula = Outcome ~ Glucose + BMI + Pregnancies + DiabetesPedigreeFunction,
## family = binomial, data = data.frame(train, Outcome = y.train))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.1458 0.1287 -8.903 < 2e-16 ***
## Glucose 1.7903 0.1627 11.002 < 2e-16 ***
## BMI 0.9361 0.1381 6.780 1.2e-11 ***
## Pregnancies 0.6975 0.1224 5.700 1.2e-08 ***
## DiabetesPedigreeFunction 0.4561 0.1211 3.767 0.000165 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 832.15 on 646 degrees of freedom
## Residual deviance: 461.06 on 642 degrees of freedom
## AIC: 471.06
##
## Number of Fisher Scoring iterations: 6
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## glm(formula = Outcome ~ Glucose + BMI + Pregnancies + DiabetesPedigreeFunction,
## family = binomial, data = data.frame(train, Outcome = y.train))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.1133 0.1277 -8.719 < 2e-16 ***
## Glucose 1.7926 0.1644 10.901 < 2e-16 ***
## BMI 0.9549 0.1393 6.853 7.22e-12 ***
## Pregnancies 0.6677 0.1204 5.544 2.95e-08 ***
## DiabetesPedigreeFunction 0.5383 0.1243 4.330 1.49e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 831.31 on 645 degrees of freedom
## Residual deviance: 460.74 on 641 degrees of freedom
## AIC: 470.74
##
## Number of Fisher Scoring iterations: 6
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## glm(formula = Outcome ~ Glucose + BMI + Pregnancies + DiabetesPedigreeFunction,
## family = binomial, data = data.frame(train, Outcome = y.train))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.1497 0.1300 -8.844 < 2e-16 ***
## Glucose 1.7925 0.1627 11.016 < 2e-16 ***
## BMI 0.9372 0.1375 6.815 9.41e-12 ***
## Pregnancies 0.7573 0.1224 6.186 6.18e-10 ***
## DiabetesPedigreeFunction 0.5032 0.1242 4.051 5.10e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 832.15 on 646 degrees of freedom
## Residual deviance: 459.12 on 642 degrees of freedom
## AIC: 469.12
##
## Number of Fisher Scoring iterations: 6
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## glm(formula = Outcome ~ Glucose + BMI + Pregnancies + DiabetesPedigreeFunction,
## family = binomial, data = data.frame(train, Outcome = y.train))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.0847 0.1271 -8.537 < 2e-16 ***
## Glucose 1.7976 0.1621 11.092 < 2e-16 ***
## BMI 0.9741 0.1384 7.038 1.95e-12 ***
## Pregnancies 0.6885 0.1211 5.686 1.30e-08 ***
## DiabetesPedigreeFunction 0.4772 0.1241 3.844 0.000121 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 832.15 on 646 degrees of freedom
## Residual deviance: 462.74 on 642 degrees of freedom
## AIC: 472.74
##
## Number of Fisher Scoring iterations: 6
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## glm(formula = Outcome ~ Glucose + BMI + Pregnancies + DiabetesPedigreeFunction,
## family = binomial, data = data.frame(train, Outcome = y.train))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.0890 0.1276 -8.534 < 2e-16 ***
## Glucose 1.7736 0.1628 10.896 < 2e-16 ***
## BMI 1.0023 0.1427 7.022 2.19e-12 ***
## Pregnancies 0.7558 0.1227 6.161 7.24e-10 ***
## DiabetesPedigreeFunction 0.5096 0.1258 4.051 5.11e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 832.15 on 646 degrees of freedom
## Residual deviance: 456.57 on 642 degrees of freedom
## AIC: 466.57
##
## Number of Fisher Scoring iterations: 6
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## glm(formula = Outcome ~ Glucose + BMI + Pregnancies + DiabetesPedigreeFunction,
## family = binomial, data = data.frame(train, Outcome = y.train))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.0490 0.1253 -8.373 < 2e-16 ***
## Glucose 1.7909 0.1631 10.978 < 2e-16 ***
## BMI 0.9166 0.1372 6.678 2.42e-11 ***
## Pregnancies 0.7188 0.1218 5.902 3.60e-09 ***
## DiabetesPedigreeFunction 0.5492 0.1251 4.391 1.13e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 834.29 on 647 degrees of freedom
## Residual deviance: 466.29 on 643 degrees of freedom
## AIC: 476.29
##
## Number of Fisher Scoring iterations: 6
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## glm(formula = Outcome ~ Glucose + BMI + Pregnancies + DiabetesPedigreeFunction +
## Insulin, family = binomial, data = data.frame(train, Outcome = y.train))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.0716 0.1271 -8.430 < 2e-16 ***
## Glucose 1.6677 0.1835 9.088 < 2e-16 ***
## BMI 0.9128 0.1399 6.526 6.76e-11 ***
## Pregnancies 0.6319 0.1215 5.199 2.00e-07 ***
## DiabetesPedigreeFunction 0.5341 0.1237 4.316 1.59e-05 ***
## Insulin 0.2702 0.1583 1.707 0.0879 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 832.15 on 646 degrees of freedom
## Residual deviance: 459.44 on 641 degrees of freedom
## AIC: 471.44
##
## Number of Fisher Scoring iterations: 6
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## glm(formula = Outcome ~ Glucose + BMI + Pregnancies + DiabetesPedigreeFunction,
## family = binomial, data = data.frame(train, Outcome = y.train))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.0628 0.1255 -8.471 < 2e-16 ***
## Glucose 1.7783 0.1594 11.159 < 2e-16 ***
## BMI 0.9939 0.1397 7.112 1.14e-12 ***
## Pregnancies 0.5959 0.1217 4.896 9.78e-07 ***
## DiabetesPedigreeFunction 0.4428 0.1266 3.499 0.000467 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 834.29 on 647 degrees of freedom
## Residual deviance: 461.47 on 643 degrees of freedom
## AIC: 471.47
##
## Number of Fisher Scoring iterations: 6
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## glm(formula = Outcome ~ Glucose + BMI + Pregnancies + DiabetesPedigreeFunction,
## family = binomial, data = data.frame(train, Outcome = y.train))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.1010 0.1271 -8.661 < 2e-16 ***
## Glucose 1.7671 0.1625 10.876 < 2e-16 ***
## BMI 1.0029 0.1409 7.119 1.09e-12 ***
## Pregnancies 0.7128 0.1217 5.859 4.65e-09 ***
## DiabetesPedigreeFunction 0.4427 0.1228 3.604 0.000313 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 831.31 on 645 degrees of freedom
## Residual deviance: 462.34 on 641 degrees of freedom
## AIC: 472.34
##
## Number of Fisher Scoring iterations: 6
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Evaluating: qda
## Call:
## qda(train, grouping = y.train)
##
## Prior probabilities of groups:
## 0 1
## 0.6558642 0.3441358
##
## Group means:
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 0 -0.1967405 -0.4411929 -0.1329525 -0.1821122 -0.2980743 -0.2598475
## 1 0.3665419 0.8285956 0.2810793 0.4199555 0.5435951 0.5466493
## DiabetesPedigreeFunction Age
## 0 -0.1448124 -0.1802842
## 1 0.2708979 0.3414711
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Call:
## qda(train, grouping = y.train)
##
## Prior probabilities of groups:
## 0 1
## 0.6568779 0.3431221
##
## Group means:
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 0 -0.1669950 -0.4206263 -0.1468499 -0.1888181 -0.2849703 -0.2610227
## 1 0.3842104 0.8367753 0.2852667 0.3805424 0.6043182 0.5238466
## DiabetesPedigreeFunction Age
## 0 -0.1531841 -0.2028205
## 1 0.2790232 0.4058471
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Call:
## qda(train, grouping = y.train)
##
## Prior probabilities of groups:
## 0 1
## 0.6563467 0.3436533
##
## Group means:
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 0 -0.1590024 -0.4243255 -0.1336713 -0.2094993 -0.2743166 -0.2661011
## 1 0.3625169 0.8227994 0.2948741 0.4239258 0.5390850 0.5382060
## DiabetesPedigreeFunction Age
## 0 -0.1630737 -0.1741283
## 1 0.2973477 0.3718069
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Call:
## qda(train, grouping = y.train)
##
## Prior probabilities of groups:
## 0 1
## 0.6568779 0.3431221
##
## Group means:
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 0 -0.2045310 -0.4195273 -0.1532195 -0.1989926 -0.2656612 -0.2481644
## 1 0.3625169 0.8235508 0.2837886 0.3499970 0.5690842 0.5431027
## DiabetesPedigreeFunction Age
## 0 -0.1490467 -0.2073692
## 1 0.2814066 0.3741818
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Call:
## qda(train, grouping = y.train)
##
## Prior probabilities of groups:
## 0 1
## 0.6568779 0.3431221
##
## Group means:
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 0 -0.1903665 -0.4422918 -0.1665378 -0.1971427 -0.2906915 -0.2610573
## 1 0.3516702 0.8152855 0.3052206 0.4203843 0.5427896 0.5168985
## DiabetesPedigreeFunction Age
## 0 -0.1209946 -0.1912422
## 1 0.2636529 0.3915977
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Call:
## qda(train, grouping = y.train)
##
## Prior probabilities of groups:
## 0 1
## 0.6568779 0.3431221
##
## Group means:
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 0 -0.1924912 -0.4211757 -0.1615193 -0.2191104 -0.3060041 -0.3040565
## 1 0.4194623 0.8091241 0.2682688 0.3730167 0.5479841 0.4934734
## DiabetesPedigreeFunction Age
## 0 -0.1646047 -0.2110908
## 1 0.2834617 0.4157425
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Call:
## qda(train, grouping = y.train)
##
## Prior probabilities of groups:
## 0 1
## 0.6558642 0.3441358
##
## Group means:
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 0 -0.1967405 -0.4382884 -0.1211784 -0.2093984 -0.2992522 -0.2705627
## 1 0.3678916 0.7994227 0.2671006 0.3732411 0.5525746 0.4755036
## DiabetesPedigreeFunction Age
## 0 -0.1561138 -0.1910355
## 1 0.2990570 0.3603850
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Call:
## qda(train, grouping = y.train)
##
## Prior probabilities of groups:
## 0 1
## 0.6568779 0.3431221
##
## Group means:
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 0 -0.1839924 -0.4463738 -0.1404803 -0.2110170 -0.3277740 -0.2739156
## 1 0.3191300 0.8295619 0.2704859 0.3947084 0.5841039 0.4812315
## DiabetesPedigreeFunction Age
## 0 -0.1420542 -0.1910355
## 1 0.3162146 0.3627031
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Call:
## qda(train, grouping = y.train)
##
## Prior probabilities of groups:
## 0 1
## 0.6558642 0.3441358
##
## Group means:
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 0 -0.1762019 -0.4377389 -0.1595891 -0.2420031 -0.3026176 -0.3020518
## 1 0.2923050 0.8683904 0.2439253 0.4001239 0.5876906 0.5054770
## DiabetesPedigreeFunction Age
## 0 -0.1471308 -0.1798706
## 1 0.2659395 0.2985206
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Call:
## qda(train, grouping = y.train)
##
## Prior probabilities of groups:
## 0 1
## 0.6563467 0.3436533
##
## Group means:
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 0 -0.2143744 -0.4237747 -0.1479884 -0.2203931 -0.2835090 -0.2629829
## 1 0.3692961 0.8127307 0.2978303 0.4358783 0.5334073 0.5539550
## DiabetesPedigreeFunction Age
## 0 -0.1210719 -0.2099814
## 1 0.2369939 0.3832856
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Evaluating: knn
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Evaluating: rf
## MeanDecreaseGini
## Pregnancies 20.51051
## Glucose 84.69843
## BloodPressure 18.19255
## SkinThickness 24.94045
## Insulin 44.10681
## BMI 43.04542
## DiabetesPedigreeFunction 25.48711
## Age 31.31815
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## MeanDecreaseGini
## Pregnancies 19.75554
## Glucose 83.86747
## BloodPressure 18.29148
## SkinThickness 23.86463
## Insulin 43.50097
## BMI 41.38599
## DiabetesPedigreeFunction 26.37779
## Age 33.86948
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## MeanDecreaseGini
## Pregnancies 19.31894
## Glucose 79.53921
## BloodPressure 18.53995
## SkinThickness 25.54237
## Insulin 43.44014
## BMI 43.18799
## DiabetesPedigreeFunction 28.69080
## Age 32.83673
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## MeanDecreaseGini
## Pregnancies 21.71049
## Glucose 78.43873
## BloodPressure 18.07948
## SkinThickness 24.98879
## Insulin 44.59824
## BMI 40.80738
## DiabetesPedigreeFunction 26.86053
## Age 35.89243
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## MeanDecreaseGini
## Pregnancies 18.93017
## Glucose 81.93000
## BloodPressure 18.81680
## SkinThickness 26.14414
## Insulin 44.24758
## BMI 42.94321
## DiabetesPedigreeFunction 26.23329
## Age 32.03701
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## MeanDecreaseGini
## Pregnancies 22.08650
## Glucose 77.87437
## BloodPressure 17.91613
## SkinThickness 24.91954
## Insulin 45.48783
## BMI 41.06655
## DiabetesPedigreeFunction 26.32934
## Age 35.18427
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## MeanDecreaseGini
## Pregnancies 19.97295
## Glucose 78.06649
## BloodPressure 18.85439
## SkinThickness 25.82720
## Insulin 44.76739
## BMI 42.67120
## DiabetesPedigreeFunction 29.57438
## Age 32.26131
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## MeanDecreaseGini
## Pregnancies 19.02441
## Glucose 80.25213
## BloodPressure 17.23796
## SkinThickness 24.82295
## Insulin 49.75145
## BMI 40.22246
## DiabetesPedigreeFunction 28.23160
## Age 30.44365
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## MeanDecreaseGini
## Pregnancies 17.64777
## Glucose 84.06759
## BloodPressure 19.27762
## SkinThickness 26.22664
## Insulin 46.64149
## BMI 42.94628
## DiabetesPedigreeFunction 26.19732
## Age 29.20901
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## MeanDecreaseGini
## Pregnancies 22.02852
## Glucose 79.38604
## BloodPressure 19.00083
## SkinThickness 27.40586
## Insulin 44.50789
## BMI 43.56313
## DiabetesPedigreeFunction 24.92970
## Age 29.97368
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Convert results to a data frame
overall_metrics <- as.data.frame(overall_metrics)
# Print overall metrics
print(overall_metrics)
## Model Accuracy Sensitivity Specificity PosPredValue
## 1 logistic 0.8260451 0.6848333 0.9003546 0.7904358
## 2 parameter_selection 0.8260842 0.6891667 0.8982270 0.7859368
## 3 qda 0.7995769 0.6440000 0.8812500 0.7478860
## 4 knn 0.7995954 0.5995000 0.9045213 0.7709222
## 5 rf 0.7968948 0.6360000 0.8812500 0.7508198
## NegPredValue Precision Recall F1 AUC
## 1 0.8464904 0.7904358 0.6848333 0.7283963 0.8989663
## 2 0.8485467 0.7859368 0.6891667 0.7285339 0.9023511
## 3 0.8270372 0.7478860 0.6440000 0.6860185 0.8667128
## 4 0.8123533 0.7709222 0.5995000 0.6719934 0.7520106
## 5 0.8232406 0.7508198 0.6360000 0.6821290 0.8925550
# Plot the metrics
library(tidyr)
##
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
##
## expand, pack, unpack
library(ggplot2)
# Reshape data for plotting
metrics_long <- pivot_longer(overall_metrics, cols = -Model, names_to = "Metric", values_to = "Value")
# Create bar plot
ggplot(metrics_long, aes(x = Model, y = as.numeric(Value), fill = Metric)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Model Comparison", x = "Model", y = "Metric Value") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
for (model_name in models) {
for (i in seq_along(roc.list[[model_name]])) {
# Plot the ROC for each fold
plot(
roc.list[[model_name]][[i]],
col = i,
add = i > 1, # Add to the same plot after the first fold
main = paste("ROC for", model_name)
)
}
# Add a legend (optional)
legend("bottomright",
legend = paste("Fold", seq_along(roc.list[[model_name]])),
col = seq_along(roc.list[[model_name]]),
lty = 1,
title = "Folds")
}
library(MASS)
library(ggplot2)
# Subset the relevant predictors and the target variable
data <- diabetes.complete[, c("Glucose", "BMI", "Outcome")]
# Ensure the Outcome is a factor
data$Outcome <- factor(data$Outcome, levels = c(0, 1), labels = c("No Diabetes", "Diabetes"))
# Fit QDA model
qda.model <- qda(Outcome ~ Glucose + BMI, data = data)
# Create a grid of values for Glucose and BMI
glucose_seq <- seq(min(data$Glucose) - 5, max(data$Glucose) + 5, length.out = 200)
bmi_seq <- seq(min(data$BMI) - 5, max(data$BMI) + 5, length.out = 200)
grid <- expand.grid(Glucose = glucose_seq, BMI = bmi_seq)
# Predict class labels for the grid
grid$Outcome <- predict(qda.model, newdata = grid)$class
# Plot the decision boundary with training points
ggplot() +
geom_point(data = grid, aes(x = Glucose, y = BMI, color = Outcome), alpha = 0.3) +
geom_point(data = data, aes(x = Glucose, y = BMI, color = Outcome, shape = Outcome), size = 3) +
labs(title = "QDA Decision Boundary for Glucose vs BMI",
x = "Glucose",
y = "BMI") +
theme_minimal() +
scale_color_manual(values = c("No Diabetes" = "blue", "Diabetes" = "red")) +
scale_shape_manual(values = c("No Diabetes" = 16, "Diabetes" = 17))
library(class)
library(ggplot2)
# Subset the relevant predictors and target variable
data <- diabetes.complete[, c("Glucose", "BMI", "Outcome")]
# Ensure Outcome is a factor
data$Outcome <- factor(data$Outcome, levels = c(0, 1), labels = c("No Diabetes", "Diabetes"))
# Create a grid of values for Glucose and BMI
glucose_seq <- seq(min(data$Glucose) - 5, max(data$Glucose) + 5, length.out = 200)
bmi_seq <- seq(min(data$BMI) - 5, max(data$BMI) + 5, length.out = 200)
grid <- expand.grid(Glucose = glucose_seq, BMI = bmi_seq)
# Prepare training data for kNN
train_data <- scale(data[, c("Glucose", "BMI")]) # Scale predictors
grid_scaled <- scale(grid, center = attr(train_data, "scaled:center"), scale = attr(train_data, "scaled:scale"))
train_labels <- data$Outcome # Class labels
# Predict class labels for the grid using kNN
grid$Outcome <- knn(
train = train_data,
test = grid_scaled,
cl = train_labels,
k = 1
)
# Plot the decision boundary with training points
ggplot() +
geom_point(data = grid, aes(x = Glucose, y = BMI, color = Outcome), alpha = 0.3) +
geom_point(data = data, aes(x = Glucose, y = BMI, color = Outcome, shape = Outcome), size = 3) +
labs(title = "kNN Decision Boundary for Glucose vs BMI (k = 1)",
x = "Glucose",
y = "BMI") +
theme_minimal() +
scale_color_manual(values = c("No Diabetes" = "blue", "Diabetes" = "red")) +
scale_shape_manual(values = c("No Diabetes" = 16, "Diabetes" = 17))
class.plot = function(model = NULL, data, train.index =NULL, method, class = NULL, k = 1,
prob = 0.5, predict_type = "class", train = T, resolution = 100, add = FALSE, ...){
if(is.null(model) & !(method %in% c("knn","Bayes")))return(
"Please type model or select method as knn or Bayes")
if(is.null(method)){return("Please type in method")}
if(method == "naiveBayes" & predict_type != "raw"){
return("Please change predict_type to 'raw'")}
if(!is.null(train.index)){data.tr = data[train.index,]
data.te = data[-train.index,]}else{data.tr = data
data.te = NULL}
if(!is.null(class)) {
cl1 <- data.tr[,class]
cl2 = data.te[,class]
}else {
cl1 <- data.tr[,3]
cl2 = data.te[,3]}
if(abs(nrow(data)-nrow(data.tr))>1){if(length(unique(cl1)) != length(unique(cl2))){
return("training and test sets class numbers do not match")}}
plot.title = paste(k,"-NN classification for ",sep = "")
if(method == "logistic" & length(unique(cl1))<=2){
plot.title = paste("Logistic regression classification with p = ",prob," for ",sep = "")
}else if(method == "logistic" & length(unique(cl1)) > 2){
plot.title = paste("Logistic regression classification for ")
}else if(method == "lda"){plot.title = paste("LDA classification for ")
}else if(method == "qda"){plot.title = paste("QDA classification for ")
}else if(method == "naiveBayes"){plot.title =
paste("Naive Bayes classification for ",sep = "")}
if(!add){
if(train){
plot(data.tr[,1:2], col = as.integer(cl1)+1L, pch = as.integer(cl1)+1L,
main = paste(plot.title, "training data", sep = ""), ...)
} else {
plot(data.te[,1:2], col = as.integer(cl2)+1L, pch = as.integer(cl2)+1L,
main = paste(plot.title,"test data",sep = ""), ...)}
}
r <- sapply(data[,1:2], range, na.rm = TRUE)
xs <- seq(r[1,1], r[2,1], length.out = resolution)
ys <- seq(r[1,2], r[2,2], length.out = resolution)
g <- cbind(rep(xs, each=resolution), rep(ys, time = resolution))
colnames(g) <- colnames(r)
g <- as.data.frame(g)
if(method == "knn"){
p = knn(data.tr[,1:2],g,data.tr[,3], k = k)
}else if(method == "logistic" & length(unique(cl1))<=2){
p = predict(model, g, type = predict_type)
p = ifelse(p>prob,1,0)
}else if(method == "naiveBayes"){
p = predict(model, g, type = predict_type)
p = apply(p,1,which.max)
}else if(method == "Bayes"){
p = model(g)
}else{
p = predict(model, g)
}
if(is.list(p)) p <- p$class
p <- as.factor(p)
z <- matrix(as.integer(p), nrow = resolution, byrow = TRUE)
contour(xs, ys, z, add = TRUE, drawlabels = FALSE,
lwd = 2, levels = (1:(length(unique(data[,3]))-1))+.5, ...)
points(g, col = as.integer(p)+1L, pch = ".")
invisible(z)
}
data <- diabetes.complete[, c("Glucose", "BMI", "Outcome")]
# Ensure Outcome is a factor
data$Outcome <- factor(data$Outcome, levels = c(0, 1), labels = c("No Diabetes", "Diabetes"))
# Use `class.plot` to visualize the decision boundary for kNN
class.plot(
data = data,
train.index = seq_len(nrow(data)), # Use all data for training
method = "knn",
class = "Outcome",
k = 1, # Number of neighbors
resolution = 100
)
# Fit QDA model
qda.model <- qda(Outcome ~ Glucose + BMI, data = data)
# Use `class.plot` to visualize the decision boundary for QDA
class.plot(
model = qda.model, # QDA model
data = data, # Full dataset
train.index = seq_len(nrow(data)), # Use all data for training
method = "qda", # Specify method as QDA
class = "Outcome", # Target variable
resolution = 100 # Resolution of the decision boundary
)
logistic.model <- glm(Outcome ~ ., data=diabetes.complete, family=binomial)
print(summary(logistic.model))
##
## Call:
## glm(formula = Outcome ~ ., family = binomial, data = diabetes.complete)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.09937 0.12153 -9.046 < 2e-16 ***
## Pregnancies 0.68623 0.13902 4.936 7.96e-07 ***
## Glucose 1.73078 0.17651 9.805 < 2e-16 ***
## BloodPressure -0.14171 0.12911 -1.098 0.272
## SkinThickness -0.09050 0.14973 -0.604 0.546
## Insulin 0.11967 0.14802 0.809 0.419
## BMI 1.04866 0.17272 6.071 1.27e-09 ***
## DiabetesPedigreeFunction 0.48664 0.11838 4.111 3.94e-05 ***
## Age 0.08654 0.13877 0.624 0.533
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 925.14 on 718 degrees of freedom
## Residual deviance: 510.47 on 710 degrees of freedom
## AIC: 528.47
##
## Number of Fisher Scoring iterations: 6
rf.model <- randomForest(as.factor(Outcome) ~ ., data = diabetes.complete)
print(importance(rf.model))
## MeanDecreaseGini
## Pregnancies 21.53163
## Glucose 89.33969
## BloodPressure 20.75641
## SkinThickness 28.84184
## Insulin 49.76790
## BMI 47.21837
## DiabetesPedigreeFunction 29.11364
## Age 36.78322