---
title: "Model Evaluation: Diabetic Data Prediction"
author: "Group 12 - Programming For Data Science (WQD7004)"
output:
  html_document:
    theme: flatly
    toc: true
    toc_float: true
---
## Load the Models and Data
``` r
# Read the previously saved model objects from their .rds files.

# Classification models
loaded_logistic_model <- readRDS("logistic_model.rds")
loaded_tree_model     <- readRDS("decision_tree_model.rds")
loaded_svm_model      <- readRDS("svm_model.rds")

# Regression models (scored against BMI further down)
loaded_lm_model  <- readRDS("linear_regression_model.rds")
loaded_rf_model  <- readRDS("random_forest_model.rds")
loaded_xgb_model <- readRDS("xgboost_model.rds")

# Read the raw data and draw an 80/20 partition on Outcome.
# NOTE(review): no set.seed() is visible before createDataPartition(), so the
# hold-out rows may differ between knits and may overlap the rows the loaded
# models were fitted on -- confirm the seed used by the training script.
data <- read.csv("diabetes.csv")
trainIndex <- createDataPartition(data$Outcome, p = 0.8, list = FALSE)
train <- data[trainIndex, ]
test <- data[-trainIndex, ]

# Classification test set: Outcome as a labelled factor, plus a 0/1 copy
# (1 = diabetic) for the tools that need a numeric target (ROCR, DALEX).
test$Outcome <- factor(
  test$Outcome,
  levels = c(0, 1),
  labels = c("non.diabetic", "diabetic")
)
test$Outcome_numeric <- as.numeric(test$Outcome == "diabetic")

# Regression test set: same hold-out rows with the Outcome column removed.
dataReg <- select(data, -Outcome)
testData <- dataReg[-trainIndex, ]
We evaluated the Logistic Regression model using the Area Under the Curve (AUC) and ROC performance. The confusion matrix, accuracy, and Kappa statistics for this model are displayed below:
# Score the hold-out rows with the logistic model; type = "response" yields
# values that the 0.5 cut-off below treats as the probability of "diabetic".
logistic_probs <- predict(
  loaded_logistic_model,
  newdata = test,
  type = "response"
)
logistic_preds <- ifelse(logistic_probs > 0.5, "diabetic", "non.diabetic")

# Give the predictions the same factor levels as the reference before
# cross-tabulating them.
# NOTE(review): the printed summary reports non.diabetic as the 'Positive'
# class, so Sensitivity refers to non-diabetic cases; pass
# positive = "diabetic" to confusionMatrix() if diabetic detection is wanted.
logistic_pred_factor <- factor(logistic_preds, levels = levels(test$Outcome))
logistic_conf <- confusionMatrix(
  data = logistic_pred_factor,
  reference = test$Outcome
)
logistic_conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction non.diabetic diabetic
## non.diabetic 94 22
## diabetic 7 30
##
## Accuracy : 0.8105
## 95% CI : (0.7393, 0.8692)
## No Information Rate : 0.6601
## P-Value [Acc > NIR] : 2.948e-05
##
## Kappa : 0.5458
##
## Mcnemar's Test P-Value : 0.00933
##
## Sensitivity : 0.9307
## Specificity : 0.5769
## Pos Pred Value : 0.8103
## Neg Pred Value : 0.8108
## Prevalence : 0.6601
## Detection Rate : 0.6144
## Detection Prevalence : 0.7582
## Balanced Accuracy : 0.7538
##
## 'Positive' Class : non.diabetic
##
The ROC curve for Logistic Regression was plotted to show the model’s ability to differentiate between the two classes (diabetic vs. non-diabetic):
# ROC analysis for the logistic model (ROCR): true-positive rate against
# false-positive rate over all probability cut-offs, plus the area under
# that curve.
logistic_pred <- prediction(logistic_probs, test$Outcome_numeric)
logistic_perf <- performance(logistic_pred, "tpr", "fpr")
logistic_auc <- performance(logistic_pred, "auc")@y.values[[1]]

# The ROCR plot is base graphics. ggsave() only writes the most recent
# ggplot, so it cannot save this figure; draw it once for the report and once
# into a png() device instead.
draw_logistic_roc <- function() {
  plot(logistic_perf, col = "blue", lty = 1, main = "ROC Curves")
  legend(
    "bottomright",
    legend = sprintf("AUC = %.3f", logistic_auc),
    bty = "n"
  )
}
draw_logistic_roc()

# Written to its own file: "logistic_performance.png" is also the target of
# the ggsave() call that follows plot(performance_logistic), which would
# overwrite the ROC curve.
png("logistic_roc.png", width = 7, height = 5, units = "in", res = 300)
draw_logistic_roc()
invisible(dev.off())
Feature importance was evaluated to understand the significance of each feature used by the logistic model:
# Build the DALEX explainer for the logistic model.
# `data` must hold predictors only: `test` also carries Outcome and
# Outcome_numeric, and leaving the target in `data` makes it show up as a
# "feature" in the importance plot. The target is supplied through `y`.
logistic_features <- test[
  ,
  setdiff(names(test), c("Outcome", "Outcome_numeric")),
  drop = FALSE
]
explainer_logistic <- explain(
  model = loaded_logistic_model,
  data = logistic_features,
  y = test$Outcome_numeric,
  # Return predicted probabilities so residuals are y - P(diabetic)
  predict_function = function(m, d) predict(m, newdata = d, type = "response"),
  label = "Logistic Regression"
)
## Preparation of a new explainer is initiated
## -> model label : Logistic Regression
## -> data : 153 rows 10 cols
## -> target variable : 153 values
## -> predict function : function(m, d) predict(m, newdata = d, type = "response")
## -> predicted values : No value for predict function target column. ( default )
## -> model_info : package stats , ver. 4.4.1 , task classification ( default )
## -> predicted values : numerical, min = 0.002085536 , mean = 0.3161875 , max = 0.9882122
## -> residual function : difference between y and yhat ( default )
## -> residuals : numerical, min = -0.9553227 , mean = 0.02368182 , max = 0.9798784
## A new explainer has been created!
# Model-performance summary from the explainer: show it in the report, then
# write that same plot object to disk.
performance_logistic <- model_performance(explainer_logistic)
performance_plot <- plot(performance_logistic)
performance_plot
ggsave("logistic_performance.png", plot = performance_plot)
## Saving 7 x 5 in image
# Variable-importance summary from the explainer, handled the same way.
importance_logistic <- model_parts(explainer_logistic)
importance_plot <- plot(importance_logistic)
importance_plot
ggsave("logistic_importance.png", plot = importance_plot)
## Saving 7 x 5 in image
For the Decision Tree model, we computed the confusion matrix for the predicted values against the actual outcomes, displaying the accuracy and Kappa values.
# Hard class predictions from the decision tree, cross-tabulated against the
# observed outcome.
# NOTE(review): the printed summary reports non.diabetic as the 'Positive'
# class; pass positive = "diabetic" if diabetic detection is wanted.
tree_preds <- predict(
  loaded_tree_model,
  newdata = test,
  type = "class"
)
tree_conf <- confusionMatrix(data = tree_preds, reference = test$Outcome)
tree_conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction non.diabetic diabetic
## non.diabetic 92 18
## diabetic 9 34
##
## Accuracy : 0.8235
## 95% CI : (0.7537, 0.8804)
## No Information Rate : 0.6601
## P-Value [Acc > NIR] : 5.393e-06
##
## Kappa : 0.5895
##
## Mcnemar's Test P-Value : 0.1237
##
## Sensitivity : 0.9109
## Specificity : 0.6538
## Pos Pred Value : 0.8364
## Neg Pred Value : 0.7907
## Prevalence : 0.6601
## Detection Rate : 0.6013
## Detection Prevalence : 0.7190
## Balanced Accuracy : 0.7824
##
## 'Positive' Class : non.diabetic
##
For the SVM model, we calculated the confusion matrix and visualized the ROC curve.
# Class predictions from the SVM, cross-tabulated against the observed
# outcome.
# NOTE(review): the printed summary reports non.diabetic as the 'Positive'
# class; pass positive = "diabetic" if diabetic detection is wanted.
svm_preds <- predict(
  loaded_svm_model,
  newdata = test
)
svm_conf <- confusionMatrix(data = svm_preds, reference = test$Outcome)
svm_conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction non.diabetic diabetic
## non.diabetic 98 25
## diabetic 3 27
##
## Accuracy : 0.817
## 95% CI : (0.7465, 0.8748)
## No Information Rate : 0.6601
## P-Value [Acc > NIR] : 1.287e-05
##
## Kappa : 0.5455
##
## Mcnemar's Test P-Value : 7.229e-05
##
## Sensitivity : 0.9703
## Specificity : 0.5192
## Pos Pred Value : 0.7967
## Neg Pred Value : 0.9000
## Prevalence : 0.6601
## Detection Rate : 0.6405
## Detection Prevalence : 0.8039
## Balanced Accuracy : 0.7448
##
## 'Positive' Class : non.diabetic
##
# ROC for the SVM. The scores are the predicted class labels converted to
# numeric codes, so the curve has a single operating point between (0, 0)
# and (1, 1).
# NOTE(review): use decision values or class probabilities from the SVM
# if a full threshold sweep is wanted.
svm_pred <- prediction(as.numeric(svm_preds), test$Outcome_numeric)
svm_perf <- performance(svm_pred, "tpr", "fpr")

# Draw as a normal high-level plot. The previous plot.new() + add = TRUE
# combination drew the line on an empty canvas, with no axes and no title.
draw_svm_roc <- function() {
  plot(svm_perf, col = "green", lty = 3, main = "SVM ROC Curve")
}
draw_svm_roc()

# Base graphics must be saved through a graphics device; ggsave() would write
# the most recent ggplot instead of this curve.
png("svm_performance.png", width = 7, height = 5, units = "in", res = 300)
draw_svm_roc()
invisible(dev.off())
The overall accuracy and Kappa statistics for all classification models are summarized below, first as a bar chart of accuracy and then as a table with both metrics:
# Gather the three confusion-matrix objects under their display names so the
# overall statistics can be pulled out in one pass.
conf_by_model <- list(
  "Logistic Regression" = logistic_conf,
  "Decision Tree" = tree_conf,
  "SVM" = svm_conf
)

# Extract one entry of `$overall` from every model, as a plain numeric vector.
overall_stat <- function(stat) {
  unname(vapply(conf_by_model, function(cm) cm$overall[[stat]], numeric(1)))
}

classification_summary <- data.frame(
  Model = names(conf_by_model),
  Accuracy = overall_stat("Accuracy"),
  Kappa = overall_stat("Kappa")
)

# Accuracy bar chart: shown in the report and written to disk.
accuracy_plot <- ggplot(classification_summary, aes(x = Model, y = Accuracy)) +
  geom_col(fill = "steelblue") +
  ggtitle("Classification Model Accuracy") +
  theme_minimal()
accuracy_plot
ggsave("classification_accuracy.png", plot = accuracy_plot)
## Saving 7 x 5 in image
classification_summary
## Model Accuracy Kappa
## 1 Logistic Regression 0.8104575 0.5458082
## 2 Decision Tree 0.8235294 0.5894862
## 3 SVM 0.8169935 0.5455124
We evaluated the Random Forest regression model, which predicts BMI, using RMSE and R-squared on the test set. The results are as follows:
# Random forest: predict the hold-out rows and compare with the observed BMI.
rf_preds <- predict(loaded_rf_model, newdata = testData)
rf_metrics <- data.frame(
  RMSE = RMSE(pred = rf_preds, obs = testData[["BMI"]]),
  Rsquared = R2(pred = rf_preds, obs = testData[["BMI"]])
)
rf_metrics
## RMSE Rsquared
## 1 3.693475 0.7880124
Linear Regression was evaluated using RMSE and R-squared. The metrics for Linear Regression are as follows:
# Linear regression: predict the hold-out rows and compare with the observed
# BMI.
lm_preds <- predict(loaded_lm_model, newdata = testData)
lm_metrics <- data.frame(
  RMSE = RMSE(pred = lm_preds, obs = testData[["BMI"]]),
  Rsquared = R2(pred = lm_preds, obs = testData[["BMI"]])
)
lm_metrics
## RMSE Rsquared
## 1 6.318576 0.2970557
The XGBoost regression model’s performance metrics are displayed below, including RMSE and R-squared:
# XGBoost: predict the hold-out rows and compare with the observed BMI.
xgb_preds <- predict(loaded_xgb_model, newdata = testData)
xgb_metrics <- data.frame(
  RMSE = RMSE(pred = xgb_preds, obs = testData[["BMI"]]),
  Rsquared = R2(pred = xgb_preds, obs = testData[["BMI"]])
)
xgb_metrics
## RMSE Rsquared
## 1 2.844462 0.8573595
The performance of all regression models is summarized in the bar plot below:
# Stack the per-model metric rows into one table; the row names identify
# the model.
regression_metrics <- rbind(
  Random_Forest = rf_metrics,
  Linear_Regression = lm_metrics,
  XGBoost = xgb_metrics
)

# Grouped bar plot: one group per metric, one bar per model.
# `legend.text` is spelled out in full rather than relying on partial
# matching of `legend`.
draw_regression_barplot <- function() {
  barplot(
    as.matrix(regression_metrics),
    beside = TRUE,
    col = c("blue", "red", "green"),
    main = "Regression Model Performance",
    legend.text = rownames(regression_metrics)
  )
}
draw_regression_barplot()

# barplot() is base graphics, so ggsave() would save the most recent ggplot
# (the classification accuracy chart) under this file name. Render the bar
# plot into a png() device instead.
png("regression_performance.png", width = 7, height = 5, units = "in", res = 300)
draw_regression_barplot()
invisible(dev.off())
The final regression metrics for all models are summarized here:
# Print the combined RMSE / R-squared table (one row per regression model)
regression_metrics
## RMSE Rsquared
## Random_Forest 3.693475 0.7880124
## Linear_Regression 6.318576 0.2970557
## XGBoost 2.844462 0.8573595
This document evaluated several classification models that predict whether a patient is diabetic and several regression models that predict BMI. We calculated confusion matrices, AUC, ROC curves, feature importance, and performance metrics such as RMSE and R-squared.
# Additional insights or final remarks can be added here
To display the saved PNG images in the final report, use the
following code to call the images from the saved .png
files:
# Display logistic_performance.png. This file is last written by the ggsave()
# call that follows plot(performance_logistic), so it holds the logistic
# model-performance plot rather than the ROC curve.
knitr::include_graphics("logistic_performance.png")
# Display the saved feature importance plot for logistic regression
knitr::include_graphics("logistic_importance.png")
# Display svm_performance.png (written in the SVM ROC section)
knitr::include_graphics("svm_performance.png")
# Display regression_performance.png (written after the regression bar plot)
knitr::include_graphics("regression_performance.png")