library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.1
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(DataExplorer)
## Warning: package 'DataExplorer' was built under R version 4.5.2
library(corrplot)
## corrplot 0.95 loaded
library(GGally)
## Warning: package 'GGally' was built under R version 4.5.2
library(dplyr)
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
** Conduct outlier and missing data analysis
# Import the obesity-level dataset; the relative path is resolved against the
# current working directory.
# NOTE(review): the data frame is called `diabetes` although the file and its
# NObeyesdad target describe obesity levels. The name is kept because the rest
# of the script refers to it.
diabetes <- read.csv(
  file = "ObesityDataSet_raw_and_data_sinthetic.csv"
)
# Preview the first rows
head(x = diabetes)
## Gender Age Height Weight family_history_with_overweight FAVC FCVC NCP
## 1 Female 21 1.62 64.0 yes no 2 3
## 2 Female 21 1.52 56.0 yes no 3 3
## 3 Male 23 1.80 77.0 yes no 2 3
## 4 Male 27 1.80 87.0 no no 3 3
## 5 Male 22 1.78 89.8 no no 2 1
## 6 Male 29 1.62 53.0 no yes 2 3
## CAEC SMOKE CH2O SCC FAF TUE CALC MTRANS
## 1 Sometimes no 2 no 0 1 no Public_Transportation
## 2 Sometimes yes 3 yes 3 0 Sometimes Public_Transportation
## 3 Sometimes no 2 no 2 1 Frequently Public_Transportation
## 4 Sometimes no 2 no 2 0 Frequently Walking
## 5 Sometimes no 2 no 0 0 Sometimes Public_Transportation
## 6 Sometimes no 2 no 0 0 Sometimes Automobile
## NObeyesdad
## 1 Normal_Weight
## 2 Normal_Weight
## 3 Normal_Weight
## 4 Overweight_Level_I
## 5 Overweight_Level_II
## 6 Normal_Weight
# Count missing values per column (the recorded output shows none)
diabetes %>%
  is.na() %>%
  colSums()
## Gender Age
## 0 0
## Height Weight
## 0 0
## family_history_with_overweight FAVC
## 0 0
## FCVC NCP
## 0 0
## CAEC SMOKE
## 0 0
## CH2O SCC
## 0 0
## FAF TUE
## 0 0
## CALC MTRANS
## 0 0
## NObeyesdad
## 0
library(DataExplorer)
# Plot the missing-value profile of every column
plot_missing(data = diabetes)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the DataExplorer package.
## Please report the issue at
## <https://github.com/boxuancui/DataExplorer/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Column types and a glimpse of the stored values
str(object = diabetes)
## 'data.frame': 2111 obs. of 17 variables:
## $ Gender : chr "Female" "Female" "Male" "Male" ...
## $ Age : num 21 21 23 27 22 29 23 22 24 22 ...
## $ Height : num 1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
## $ Weight : num 64 56 77 87 89.8 53 55 53 64 68 ...
## $ family_history_with_overweight: chr "yes" "yes" "yes" "no" ...
## $ FAVC : chr "no" "no" "no" "no" ...
## $ FCVC : num 2 3 2 3 2 2 3 2 3 2 ...
## $ NCP : num 3 3 3 3 1 3 3 3 3 3 ...
## $ CAEC : chr "Sometimes" "Sometimes" "Sometimes" "Sometimes" ...
## $ SMOKE : chr "no" "yes" "no" "no" ...
## $ CH2O : num 2 3 2 2 2 2 2 2 2 2 ...
## $ SCC : chr "no" "yes" "no" "no" ...
## $ FAF : num 0 3 2 2 0 0 1 3 1 1 ...
## $ TUE : num 1 0 1 0 0 0 0 0 1 1 ...
## $ CALC : chr "no" "Sometimes" "Frequently" "Frequently" ...
## $ MTRANS : chr "Public_Transportation" "Public_Transportation" "Public_Transportation" "Walking" ...
## $ NObeyesdad : chr "Normal_Weight" "Normal_Weight" "Normal_Weight" "Overweight_Level_I" ...
# check outliers: one base-graphics boxplot per variable, titled with its name
walk(
  c("Age", "Height", "Weight"),
  \(var_name) boxplot(diabetes[[var_name]], main = var_name)
)
** Visualize data
# Single-variable visualizations
library(tidyverse)
# Keep only the numeric columns. `select(where(...))` replaces the superseded
# scoped verb `select_if()` and returns the same columns.
num_vars <- diabetes %>% select(where(is.numeric))
num_cols <- names(num_vars)
# Draw one 30-bin histogram per numeric column. The plot is print()ed
# explicitly because ggplot objects are not auto-printed inside a for loop.
for (col in num_cols) {
  graph <- ggplot(diabetes, aes(x = .data[[col]])) +
    geom_histogram(fill = "skyblue", color = "black", bins = 30) +
    ggtitle(paste("Histogram of", col)) +
    theme_minimal()
  print(graph)
}
# Bivariate visualizations
# Weight by gender: one box per gender, filled by gender
ggplot(
  data = diabetes,
  mapping = aes(x = Gender, y = Weight, fill = Gender)
) +
  theme_minimal() +
  geom_boxplot() +
  labs(title = "Weight Distribution by Gender")
# Height against weight: semi-transparent points coloured by gender
ggplot(
  data = diabetes,
  mapping = aes(x = Height, y = Weight, color = Gender)
) +
  theme_minimal() +
  geom_point(alpha = 0.6) +
  labs(title = "Height vs Weight by Gender")
# Correlation matrix
library(corrplot)
# Pairwise correlations between the numeric columns (cor() default method)
cor_matrix <- num_vars %>% cor()
# Colour-coded upper triangle with black, slightly reduced text labels
corrplot(
  corr = cor_matrix,
  type = "upper",
  method = "color",
  tl.cex = 0.8,
  tl.col = "black"
)
** features analysis
# data preprocessing
# 1. separate the predictors (X) from the target (y)
# Predictors: every column except NObeyesdad, character columns as factors
X <- diabetes %>%
  select(-NObeyesdad) %>%
  mutate(across(where(is.character), as.factor))
# Target: the NObeyesdad obesity level as a factor
y <- as.factor(diabetes[["NObeyesdad"]])
# one-hot encoding
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Build the dummy-variable encoder from all predictors. fullRank = TRUE asks
# for a full-rank parameterization (one level per factor acts as reference).
dummy <- dummyVars(formula = ~ ., data = X, fullRank = TRUE)
# Apply the encoder (NAs passed through unchanged) and store as a data frame
X_encoded <- predict(dummy, newdata = X, na.action = na.pass) %>%
  data.frame()
# split dataset into training and test (80% of the rows go to training)
# The split is drawn BEFORE any scaling parameters are estimated.
set.seed(123)
trainIndex <- createDataPartition(y, p = 0.8, list = FALSE)
# Normalization
# The centering/scaling parameters are estimated from the training rows only,
# so no information about the test rows leaks into preprocessing. The fitted
# transform is then applied to every row, which keeps X_scaled available with
# the same shape as before.
preProc <- preProcess(
  X_encoded[trainIndex, , drop = FALSE],
  method = c("center", "scale")
)
X_scaled <- predict(preProc, X_encoded)
# Row subsets for model fitting and hold-out evaluation
X_train <- X_scaled[trainIndex, , drop = FALSE]
y_train <- y[trainIndex]
X_test <- X_scaled[-trainIndex, , drop = FALSE]
y_test <- y[-trainIndex]
# Report how many observations ended up in each partition
cat("Train: ", nrow(X_train), "observations\n")
## Train: 1691 observations
cat("Test: ", nrow(X_test), "observations\n")
## Test: 420 observations
# Class counts in the training partition
print(table(y_train))
## y_train
## Insufficient_Weight Normal_Weight Obesity_Type_I Obesity_Type_II
## 218 230 281 238
## Obesity_Type_III Overweight_Level_I Overweight_Level_II
## 260 232 232
# Class counts in the test partition
print(table(y_test))
## y_test
## Insufficient_Weight Normal_Weight Obesity_Type_I Obesity_Type_II
## 54 57 70 59
## Obesity_Type_III Overweight_Level_I Overweight_Level_II
## 64 58 58
# Resampling scheme shared by every model below: 5-fold cross-validation,
# default accuracy/kappa summary, no class probabilities, and only the
# predictions of the final tuning setting are stored.
set.seed(123)
trCtrl <- trainControl(
  method = "cv",
  number = 5,
  savePredictions = "final",
  summaryFunction = defaultSummary,
  classProbs = FALSE
)
# random forest
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# Fit with the shared CV scheme, trying 5 candidate tuning values and
# selecting by accuracy. The seed is reset so resampling is reproducible.
set.seed(123)
rf_fit <- train(
  x = X_train,
  y = y_train,
  method = "rf",
  trControl = trCtrl,
  metric = "Accuracy",
  tuneLength = 5
)
# Hold-out evaluation on the test partition
rf_pred <- predict(rf_fit, newdata = X_test)
rf_cm <- confusionMatrix(data = rf_pred, reference = y_test)
rf_acc <- rf_cm$overall["Accuracy"]
cat("Random Forest Accuracy:", round(as.numeric(rf_acc), 4), "\n")
## Random Forest Accuracy: 0.969
# Predicted (rows) versus reference (columns) class counts
print(rf_cm$table)
## Reference
## Prediction Insufficient_Weight Normal_Weight Obesity_Type_I
## Insufficient_Weight 51 1 0
## Normal_Weight 3 56 0
## Obesity_Type_I 0 0 68
## Obesity_Type_II 0 0 0
## Obesity_Type_III 0 0 0
## Overweight_Level_I 0 0 0
## Overweight_Level_II 0 0 2
## Reference
## Prediction Obesity_Type_II Obesity_Type_III Overweight_Level_I
## Insufficient_Weight 0 0 0
## Normal_Weight 0 0 3
## Obesity_Type_I 0 0 0
## Obesity_Type_II 59 0 0
## Obesity_Type_III 0 64 0
## Overweight_Level_I 0 0 52
## Overweight_Level_II 0 0 3
## Reference
## Prediction Overweight_Level_II
## Insufficient_Weight 0
## Normal_Weight 0
## Obesity_Type_I 1
## Obesity_Type_II 0
## Obesity_Type_III 0
## Overweight_Level_I 0
## Overweight_Level_II 57
# evaluation metrics
library(caret)
# Test-set predictions and confusion matrix for the random forest
pred_rf <- predict(rf_fit, newdata = X_test)
cm_rf <- confusionMatrix(data = pred_rf, reference = y_test)
# Section banner (single cat() call, same text as three separate calls)
cat(
  "\n==============================\n",
  "Evaluation of Random Forest Model\n",
  "==============================\n",
  sep = ""
)
##
## ==============================
## Evaluation of Random Forest Model
## ==============================
# Overall statistics
print(cm_rf$overall)
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.690476e-01 9.638504e-01 9.476520e-01 9.834183e-01 1.666667e-01
## AccuracyPValue McnemarPValue
## 3.107087e-294 NaN
# Per-class statistics
print(cm_rf$byClass)
## Sensitivity Specificity Pos Pred Value
## Class: Insufficient_Weight 0.9444444 0.9972678 0.9807692
## Class: Normal_Weight 0.9824561 0.9834711 0.9032258
## Class: Obesity_Type_I 0.9714286 0.9971429 0.9855072
## Class: Obesity_Type_II 1.0000000 1.0000000 1.0000000
## Class: Obesity_Type_III 1.0000000 1.0000000 1.0000000
## Class: Overweight_Level_I 0.8965517 1.0000000 1.0000000
## Class: Overweight_Level_II 0.9827586 0.9861878 0.9193548
## Neg Pred Value Precision Recall F1
## Class: Insufficient_Weight 0.9918478 0.9807692 0.9444444 0.9622642
## Class: Normal_Weight 0.9972067 0.9032258 0.9824561 0.9411765
## Class: Obesity_Type_I 0.9943020 0.9855072 0.9714286 0.9784173
## Class: Obesity_Type_II 1.0000000 1.0000000 1.0000000 1.0000000
## Class: Obesity_Type_III 1.0000000 1.0000000 1.0000000 1.0000000
## Class: Overweight_Level_I 0.9836957 1.0000000 0.8965517 0.9454545
## Class: Overweight_Level_II 0.9972067 0.9193548 0.9827586 0.9500000
## Prevalence Detection Rate Detection Prevalence
## Class: Insufficient_Weight 0.1285714 0.1214286 0.1238095
## Class: Normal_Weight 0.1357143 0.1333333 0.1476190
## Class: Obesity_Type_I 0.1666667 0.1619048 0.1642857
## Class: Obesity_Type_II 0.1404762 0.1404762 0.1404762
## Class: Obesity_Type_III 0.1523810 0.1523810 0.1523810
## Class: Overweight_Level_I 0.1380952 0.1238095 0.1238095
## Class: Overweight_Level_II 0.1380952 0.1357143 0.1476190
## Balanced Accuracy
## Class: Insufficient_Weight 0.9708561
## Class: Normal_Weight 0.9829636
## Class: Obesity_Type_I 0.9842857
## Class: Obesity_Type_II 1.0000000
## Class: Obesity_Type_III 1.0000000
## Class: Overweight_Level_I 0.9482759
## Class: Overweight_Level_II 0.9844732
# decision tree
# Fit with the shared CV scheme, trying 10 candidate tuning values and
# selecting by accuracy. The seed is reset so resampling is reproducible.
set.seed(123)
rpart_fit <- train(
  x = X_train,
  y = y_train,
  method = "rpart",
  trControl = trCtrl,
  metric = "Accuracy",
  tuneLength = 10
)
# Hold-out evaluation on the test partition
rpart_pred <- predict(rpart_fit, newdata = X_test)
rpart_cm <- confusionMatrix(data = rpart_pred, reference = y_test)
rpart_acc <- rpart_cm$overall["Accuracy"]
cat("Decision Tree Accuracy:", round(as.numeric(rpart_acc), 4), "\n")
## Decision Tree Accuracy: 0.8214
# Predicted (rows) versus reference (columns) class counts
print(rpart_cm$table)
## Reference
## Prediction Insufficient_Weight Normal_Weight Obesity_Type_I
## Insufficient_Weight 49 6 0
## Normal_Weight 5 33 0
## Obesity_Type_I 0 0 59
## Obesity_Type_II 0 0 7
## Obesity_Type_III 0 0 0
## Overweight_Level_I 0 18 2
## Overweight_Level_II 0 0 2
## Reference
## Prediction Obesity_Type_II Obesity_Type_III Overweight_Level_I
## Insufficient_Weight 0 0 0
## Normal_Weight 0 0 1
## Obesity_Type_I 6 0 0
## Obesity_Type_II 53 0 0
## Obesity_Type_III 0 64 0
## Overweight_Level_I 0 0 55
## Overweight_Level_II 0 0 2
## Reference
## Prediction Overweight_Level_II
## Insufficient_Weight 0
## Normal_Weight 0
## Obesity_Type_I 10
## Obesity_Type_II 1
## Obesity_Type_III 0
## Overweight_Level_I 15
## Overweight_Level_II 32
# evaluation metrics
# Test-set predictions and confusion matrix for the decision tree
pred_rpart <- predict(rpart_fit, newdata = X_test)
cm_rpart <- confusionMatrix(data = pred_rpart, reference = y_test)
# Section banner (single cat() call, same text as three separate calls)
cat(
  "\n==============================\n",
  "Evaluation of Decision Tree Model\n",
  "==============================\n",
  sep = ""
)
##
## ==============================
## Evaluation of Decision Tree Model
## ==============================
# Overall statistics
print(cm_rpart$overall)
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.214286e-01 7.913272e-01 7.813832e-01 8.568725e-01 1.666667e-01
## AccuracyPValue McnemarPValue
## 8.157771e-191 NaN
# Per-class statistics
print(cm_rpart$byClass)
## Sensitivity Specificity Pos Pred Value
## Class: Insufficient_Weight 0.9074074 0.9836066 0.8909091
## Class: Normal_Weight 0.5789474 0.9834711 0.8461538
## Class: Obesity_Type_I 0.8428571 0.9542857 0.7866667
## Class: Obesity_Type_II 0.8983051 0.9778393 0.8688525
## Class: Obesity_Type_III 1.0000000 1.0000000 1.0000000
## Class: Overweight_Level_I 0.9482759 0.9033149 0.6111111
## Class: Overweight_Level_II 0.5517241 0.9889503 0.8888889
## Neg Pred Value Precision Recall F1
## Class: Insufficient_Weight 0.9863014 0.8909091 0.9074074 0.8990826
## Class: Normal_Weight 0.9370079 0.8461538 0.5789474 0.6875000
## Class: Obesity_Type_I 0.9681159 0.7866667 0.8428571 0.8137931
## Class: Obesity_Type_II 0.9832869 0.8688525 0.8983051 0.8833333
## Class: Obesity_Type_III 1.0000000 1.0000000 1.0000000 1.0000000
## Class: Overweight_Level_I 0.9909091 0.6111111 0.9482759 0.7432432
## Class: Overweight_Level_II 0.9322917 0.8888889 0.5517241 0.6808511
## Prevalence Detection Rate Detection Prevalence
## Class: Insufficient_Weight 0.1285714 0.11666667 0.13095238
## Class: Normal_Weight 0.1357143 0.07857143 0.09285714
## Class: Obesity_Type_I 0.1666667 0.14047619 0.17857143
## Class: Obesity_Type_II 0.1404762 0.12619048 0.14523810
## Class: Obesity_Type_III 0.1523810 0.15238095 0.15238095
## Class: Overweight_Level_I 0.1380952 0.13095238 0.21428571
## Class: Overweight_Level_II 0.1380952 0.07619048 0.08571429
## Balanced Accuracy
## Class: Insufficient_Weight 0.9455070
## Class: Normal_Weight 0.7812092
## Class: Obesity_Type_I 0.8985714
## Class: Obesity_Type_II 0.9380722
## Class: Obesity_Type_III 1.0000000
## Class: Overweight_Level_I 0.9257954
## Class: Overweight_Level_II 0.7703372
# logistic regression (multinomial, because the target has seven classes)
# Fit with the shared CV scheme and caret's default tuning grid, selecting by
# accuracy. `trace = FALSE` is forwarded to the underlying fitting function;
# presumably it silences the per-iteration log (confirm against nnet docs).
set.seed(123)
multi_fit <- train(
  x = X_train,
  y = y_train,
  method = "multinom",
  trControl = trCtrl,
  metric = "Accuracy",
  trace = FALSE
)
# Hold-out evaluation on the test partition
multi_pred <- predict(multi_fit, newdata = X_test)
multi_cm <- confusionMatrix(data = multi_pred, reference = y_test)
multi_acc <- multi_cm$overall["Accuracy"]
cat("Multinomial Logistic Accuracy:", round(as.numeric(multi_acc), 4), "\n")
## Multinomial Logistic Accuracy: 0.969
# Predicted (rows) versus reference (columns) class counts
print(multi_cm$table)
## Reference
## Prediction Insufficient_Weight Normal_Weight Obesity_Type_I
## Insufficient_Weight 53 2 0
## Normal_Weight 1 52 0
## Obesity_Type_I 0 0 67
## Obesity_Type_II 0 0 0
## Obesity_Type_III 0 0 2
## Overweight_Level_I 0 3 0
## Overweight_Level_II 0 0 1
## Reference
## Prediction Obesity_Type_II Obesity_Type_III Overweight_Level_I
## Insufficient_Weight 0 0 0
## Normal_Weight 0 0 1
## Obesity_Type_I 0 0 0
## Obesity_Type_II 59 0 0
## Obesity_Type_III 0 64 0
## Overweight_Level_I 0 0 56
## Overweight_Level_II 0 0 1
## Reference
## Prediction Overweight_Level_II
## Insufficient_Weight 0
## Normal_Weight 0
## Obesity_Type_I 0
## Obesity_Type_II 0
## Obesity_Type_III 0
## Overweight_Level_I 2
## Overweight_Level_II 56
# evaluation metrics
# Test-set predictions and confusion matrix for the multinomial model
pred_multi <- predict(multi_fit, newdata = X_test)
cm_multi <- confusionMatrix(data = pred_multi, reference = y_test)
# Section banner (single cat() call, same text as three separate calls)
cat(
  "\n==============================\n",
  "Evaluation of Multinomial Logistic Model\n",
  "==============================\n",
  sep = ""
)
##
## ==============================
## Evaluation of Multinomial Logistic Model
## ==============================
# Overall statistics
print(cm_multi$overall)
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.690476e-01 9.638542e-01 9.476520e-01 9.834183e-01 1.666667e-01
## AccuracyPValue McnemarPValue
## 3.107087e-294 NaN
# Per-class statistics
print(cm_multi$byClass)
## Sensitivity Specificity Pos Pred Value
## Class: Insufficient_Weight 0.9814815 0.9945355 0.9636364
## Class: Normal_Weight 0.9122807 0.9944904 0.9629630
## Class: Obesity_Type_I 0.9571429 1.0000000 1.0000000
## Class: Obesity_Type_II 1.0000000 1.0000000 1.0000000
## Class: Obesity_Type_III 1.0000000 0.9943820 0.9696970
## Class: Overweight_Level_I 0.9655172 0.9861878 0.9180328
## Class: Overweight_Level_II 0.9655172 0.9944751 0.9655172
## Neg Pred Value Precision Recall F1
## Class: Insufficient_Weight 0.9972603 0.9636364 0.9814815 0.9724771
## Class: Normal_Weight 0.9863388 0.9629630 0.9122807 0.9369369
## Class: Obesity_Type_I 0.9915014 1.0000000 0.9571429 0.9781022
## Class: Obesity_Type_II 1.0000000 1.0000000 1.0000000 1.0000000
## Class: Obesity_Type_III 1.0000000 0.9696970 1.0000000 0.9846154
## Class: Overweight_Level_I 0.9944290 0.9180328 0.9655172 0.9411765
## Class: Overweight_Level_II 0.9944751 0.9655172 0.9655172 0.9655172
## Prevalence Detection Rate Detection Prevalence
## Class: Insufficient_Weight 0.1285714 0.1261905 0.1309524
## Class: Normal_Weight 0.1357143 0.1238095 0.1285714
## Class: Obesity_Type_I 0.1666667 0.1595238 0.1595238
## Class: Obesity_Type_II 0.1404762 0.1404762 0.1404762
## Class: Obesity_Type_III 0.1523810 0.1523810 0.1571429
## Class: Overweight_Level_I 0.1380952 0.1333333 0.1452381
## Class: Overweight_Level_II 0.1380952 0.1333333 0.1380952
## Balanced Accuracy
## Class: Insufficient_Weight 0.9880085
## Class: Normal_Weight 0.9533855
## Class: Obesity_Type_I 0.9785714
## Class: Obesity_Type_II 1.0000000
## Class: Obesity_Type_III 0.9971910
## Class: Overweight_Level_I 0.9758525
## Class: Overweight_Level_II 0.9799962
# SVM (radial-basis kernel)
# Fit with the shared CV scheme, trying 5 candidate tuning values and
# selecting by accuracy. The seed is reset so resampling is reproducible.
set.seed(123)
svm_fit <- train(
  x = X_train,
  y = y_train,
  method = "svmRadial",
  trControl = trCtrl,
  metric = "Accuracy",
  tuneLength = 5
)
# Hold-out evaluation on the test partition
svm_pred <- predict(svm_fit, newdata = X_test)
svm_cm <- confusionMatrix(data = svm_pred, reference = y_test)
svm_acc <- svm_cm$overall["Accuracy"]
cat("SVM Accuracy:", round(as.numeric(svm_acc), 4), "\n")
## SVM Accuracy: 0.8786
# Predicted (rows) versus reference (columns) class counts
print(svm_cm$table)
## Reference
## Prediction Insufficient_Weight Normal_Weight Obesity_Type_I
## Insufficient_Weight 47 5 0
## Normal_Weight 7 40 4
## Obesity_Type_I 0 0 63
## Obesity_Type_II 0 0 0
## Obesity_Type_III 0 0 0
## Overweight_Level_I 0 11 1
## Overweight_Level_II 0 1 2
## Reference
## Prediction Obesity_Type_II Obesity_Type_III Overweight_Level_I
## Insufficient_Weight 0 0 0
## Normal_Weight 1 0 4
## Obesity_Type_I 0 0 0
## Obesity_Type_II 58 0 0
## Obesity_Type_III 0 64 0
## Overweight_Level_I 0 0 51
## Overweight_Level_II 0 0 3
## Reference
## Prediction Overweight_Level_II
## Insufficient_Weight 0
## Normal_Weight 3
## Obesity_Type_I 4
## Obesity_Type_II 1
## Obesity_Type_III 0
## Overweight_Level_I 4
## Overweight_Level_II 46
# evaluation metrics
# Test-set predictions and confusion matrix for the SVM
pred_svm <- predict(svm_fit, newdata = X_test)
cm_svm <- confusionMatrix(data = pred_svm, reference = y_test)
# Section banner (single cat() call, same text as three separate calls)
cat(
  "\n==============================\n",
  "Evaluation of SVM Model\n",
  "==============================\n",
  sep = ""
)
##
## ==============================
## Evaluation of SVM Model
## ==============================
# Overall statistics
print(cm_svm$overall)
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.785714e-01 8.582020e-01 8.434492e-01 9.082373e-01 1.666667e-01
## AccuracyPValue McnemarPValue
## 1.138903e-225 NaN
# Per-class statistics
print(cm_svm$byClass)
## Sensitivity Specificity Pos Pred Value
## Class: Insufficient_Weight 0.8703704 0.9863388 0.9038462
## Class: Normal_Weight 0.7017544 0.9476584 0.6779661
## Class: Obesity_Type_I 0.9000000 0.9885714 0.9402985
## Class: Obesity_Type_II 0.9830508 0.9972299 0.9830508
## Class: Obesity_Type_III 1.0000000 1.0000000 1.0000000
## Class: Overweight_Level_I 0.8793103 0.9558011 0.7611940
## Class: Overweight_Level_II 0.7931034 0.9834254 0.8846154
## Neg Pred Value Precision Recall F1
## Class: Insufficient_Weight 0.9809783 0.9038462 0.8703704 0.8867925
## Class: Normal_Weight 0.9529086 0.6779661 0.7017544 0.6896552
## Class: Obesity_Type_I 0.9801700 0.9402985 0.9000000 0.9197080
## Class: Obesity_Type_II 0.9972299 0.9830508 0.9830508 0.9830508
## Class: Obesity_Type_III 1.0000000 1.0000000 1.0000000 1.0000000
## Class: Overweight_Level_I 0.9801700 0.7611940 0.8793103 0.8160000
## Class: Overweight_Level_II 0.9673913 0.8846154 0.7931034 0.8363636
## Prevalence Detection Rate Detection Prevalence
## Class: Insufficient_Weight 0.1285714 0.1119048 0.1238095
## Class: Normal_Weight 0.1357143 0.0952381 0.1404762
## Class: Obesity_Type_I 0.1666667 0.1500000 0.1595238
## Class: Obesity_Type_II 0.1404762 0.1380952 0.1404762
## Class: Obesity_Type_III 0.1523810 0.1523810 0.1523810
## Class: Overweight_Level_I 0.1380952 0.1214286 0.1595238
## Class: Overweight_Level_II 0.1380952 0.1095238 0.1238095
## Balanced Accuracy
## Class: Insufficient_Weight 0.9283546
## Class: Normal_Weight 0.8247064
## Class: Obesity_Type_I 0.9442857
## Class: Obesity_Type_II 0.9901404
## Class: Obesity_Type_III 1.0000000
## Class: Overweight_Level_I 0.9175557
## Class: Overweight_Level_II 0.8882644
** Describe the rationale behind choosing that particular method
This dataset represents a multiclass classification problem, so several classification algorithms were selected to compare model performance and interpretability.
Random Forest: Several features in the dataset, such as FAVC, CAEC, and FAF, exhibit nonlinear relationships with the target variable. Random Forest is well-suited for modeling nonlinear and high-dimensional data, as it combines multiple decision trees to improve prediction stability and accuracy. Compared with a single decision tree, it generally provides higher accuracy and better generalization.
Decision Tree: Since approximately half of the features are categorical variables, the Decision Tree model is an appropriate choice. It is intuitive, easy to interpret, and provides a clear visualization of the relationships between predictors and the target variable. This model helps to understand the key decision rules and overall data structure.
Multinomial Logistic Regression: Logistic regression serves as a baseline model for classification tasks. This dataset includes several features with linear relationships—such as Height, Weight, and Age—making this model particularly suitable. In addition, it provides interpretable coefficients that explain the direction and strength of each predictor’s influence on obesity levels.
Support Vector Machine(SVM): The SVM model performs well with medium-sized datasets and a moderate number of features. With 2,111 observations and 16 predictors, this dataset fits those conditions. SVM can effectively handle nonlinear class boundaries through the use of kernel functions, making it a valuable choice for testing alternative model performance.
** Detail the process of tuning parameters and validating the model’s performance
Random Forest: The Random Forest model was trained using the caret package with method = “rf”. The parameter mtry (the number of variables randomly sampled at each split) was tuned using 5-fold cross-validation to identify the optimal configuration that maximized model accuracy.
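Decision Tree: The Decision Tree model was trained with method = “rpart”. Its complexity parameter (cp), which controls tree growth and helps prevent overfitting, was tuned over ten candidate values (tuneLength = 10) using the same 5-fold cross-validation, and the value with the highest cross-validated accuracy was kept. By adjusting cp, the model balances accuracy and generalization, ensuring a simpler and more interpretable tree structure.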
Multinomial Logistic Regression: The Multinomial Logistic Regression model was validated using 5-fold cross-validation to select the most accurate model and avoid overfitting. This approach ensures that the model’s performance is consistent across different subsets of the data.
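Support Vector Machine (SVM): Prior to model training, the data were normalized (centered and scaled) to ensure that all features contributed equally to the distance calculations in the SVM algorithm. This preprocessing step improves the model’s convergence and classification accuracy. The radial-kernel SVM (method = “svmRadial”) was then tuned over five candidate settings (tuneLength = 5) using the same 5-fold cross-validation, keeping the setting with the highest cross-validated accuracy.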
** Results & Interpretation
Random Forest: It achieved an accuracy of 0.97 and a Kappa value of approximately 0.96 on the test set, with per-class F1-scores between 0.94 and 1.00 and per-class recall between 0.90 and 1.00. These results indicate that the Random Forest model provides highly accurate and consistent predictions across all obesity categories. Together with Multinomial Logistic Regression, which reached the same test accuracy, it is among the most suitable models for this dataset, and it can capture nonlinear relationships among the features.
Decision Tree: It reached an accuracy of about 0.82, with per-class F1-scores ranging from 0.68 to 1.00. While the model offers clear interpretability and helps visualize decision-making patterns, its predictive accuracy is notably lower than that of the Random Forest model. Thus, it is valuable for explanation and feature understanding but less effective for high-precision prediction.
Multinomial Logistic Regression: It achieved an accuracy of 0.97 and a Kappa value of approximately 0.96. Its per-class precision, recall, and F1-scores all ranged between about 0.91 and 1.00, demonstrating excellent classification performance. The model effectively distinguishes between different weight categories and provides clear insights into the linear relationships between the target variable and predictors such as Age, Weight, FAF, and TUE.
Support Vector Machine (SVM): It obtained an accuracy of 0.88 and a Kappa value of 0.86, with per-class recall and F1-scores ranging from about 0.69 to 1.00. Its performance is weaker than that of Random Forest and Multinomial Logistic Regression, although it is better than the single Decision Tree. Although SVM can capture nonlinear decision boundaries, it appears less suitable for this dataset, likely due to the mixture of categorical and numerical variables and the relatively high number of classes.
** This project was completed entirely by myself. ChatGPT was utilized only for spelling, grammar correction, and improving the clarity of written expression.