# --- Load All Required Libraries ---
# Core data manipulation
library(tidyverse) # includes ggplot2, dplyr, readr, etc.
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(caret) # train/test splitting, ML tools
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.3.3
##
## Attaching package: 'Metrics'
##
## The following objects are masked from 'package:caret':
##
## precision, recall
# Plotting
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.3.3
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.3.3
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.3.2
library(GGally)
## Warning: package 'GGally' was built under R version 4.3.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggfortify)
## Warning: package 'ggfortify' was built under R version 4.3.3
# Modeling
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.3.3
##
## Attaching package: 'xgboost'
##
## The following object is masked from 'package:dplyr':
##
## slice
library(rpart)
library(rpart.plot)
# Advanced tools
library(SHAPforxgboost)
## Warning: package 'SHAPforxgboost' was built under R version 4.3.3
library(knitr) # table formatting for outputs
library(lindia) # linear model diagnostics
## Warning: package 'lindia' was built under R version 4.3.3
#########################
# Data Pre Processing
#########################
# Load dataset
obesity <- read_csv("ObesityDataSet_raw_and_data_sinthetic.csv")
## Rows: 2111 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): Gender, family_history_with_overweight, FAVC, CAEC, SMOKE, SCC, CAL...
## dbl (8): Age, Height, Weight, FCVC, NCP, CH2O, FAF, TUE
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Clean and encode dataset
final_data <- obesity %>%
mutate(BMI = Weight / (Height^2)) %>%
select(-Weight, -Height, -NObeyesdad) %>%
mutate(
Gender = if_else(Gender == "Female", 0, 1),
family_history_with_overweight = if_else(family_history_with_overweight == "no", 0, 1),
FAVC = if_else(FAVC == "no", 0, 1),
CAEC = case_when(
CAEC == "no" ~ 0,
CAEC == "Sometimes" ~ 1,
CAEC == "Frequently" ~ 2,
CAEC == "Always" ~ 3
),
SMOKE = if_else(SMOKE == "no", 0, 1),
SCC = if_else(SCC == "no", 0, 1),
CALC = case_when(
CALC == "no" ~ 0,
CALC == "Sometimes" ~ 1,
CALC == "Frequently" ~ 2,
CALC == "Always" ~ 3
),
MTRANS = case_when(
MTRANS == "Walking" ~ 0,
MTRANS == "Bike" ~ 1,
MTRANS == "Public_Transportation" ~ 2,
MTRANS == "Motorbike" ~ 3,
MTRANS == "Automobile" ~ 4
)
)
append_model_result <- function(name, actual_train, pred_train, actual_test, pred_test) {
tibble(
Model = name,
RMSE_Train = sqrt(mean((actual_train - pred_train)^2)),
RMSE_Test = sqrt(mean((actual_test - pred_test)^2)),
MAE_Train = mean(abs(actual_train - pred_train)),
MAE_Test = mean(abs(actual_test - pred_test)),
R2_Train = 1 - sum((pred_train - actual_train)^2) / sum((mean(actual_train) - actual_train)^2),
R2_Test = 1 - sum((pred_test - actual_test)^2) / sum((mean(actual_test) - actual_test)^2)
) %>%
mutate(across(where(is.numeric), \(x) round(x, 3)))
}
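As a quick sanity check on these manual formulas, the Metrics package loaded above computes the same quantities. A minimal sketch with made-up toy values (not dataset results):
# Toy cross-check of the manual metric formulas (hypothetical values)
actual_demo <- c(22, 25, 31)
pred_demo <- c(21, 27, 30)
Metrics::rmse(actual_demo, pred_demo) # same as sqrt(mean((actual - pred)^2))
Metrics::mae(actual_demo, pred_demo)  # same as mean(abs(actual - pred))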
# define final results tibble
final_model_results <- tibble()
EDA_data <- read_csv("ObesityDataSet_raw_and_data_sinthetic.csv")
## Rows: 2111 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): Gender, family_history_with_overweight, FAVC, CAEC, SMOKE, SCC, CAL...
## dbl (8): Age, Height, Weight, FCVC, NCP, CH2O, FAF, TUE
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
############################
# Exploratory Data Analysis
############################
# View summary statistics for all features in EDA_data
summary(EDA_data)
## Gender Age Height Weight
## Length:2111 Min. :14.00 Min. :1.450 Min. : 39.00
## Class :character 1st Qu.:19.95 1st Qu.:1.630 1st Qu.: 65.47
## Mode :character Median :22.78 Median :1.700 Median : 83.00
## Mean :24.31 Mean :1.702 Mean : 86.59
## 3rd Qu.:26.00 3rd Qu.:1.768 3rd Qu.:107.43
## Max. :61.00 Max. :1.980 Max. :173.00
## family_history_with_overweight FAVC FCVC
## Length:2111 Length:2111 Min. :1.000
## Class :character Class :character 1st Qu.:2.000
## Mode :character Mode :character Median :2.386
## Mean :2.419
## 3rd Qu.:3.000
## Max. :3.000
## NCP CAEC SMOKE CH2O
## Min. :1.000 Length:2111 Length:2111 Min. :1.000
## 1st Qu.:2.659 Class :character Class :character 1st Qu.:1.585
## Median :3.000 Mode :character Mode :character Median :2.000
## Mean :2.686 Mean :2.008
## 3rd Qu.:3.000 3rd Qu.:2.477
## Max. :4.000 Max. :3.000
## SCC FAF TUE CALC
## Length:2111 Min. :0.0000 Min. :0.0000 Length:2111
## Class :character 1st Qu.:0.1245 1st Qu.:0.0000 Class :character
## Mode :character Median :1.0000 Median :0.6253 Mode :character
## Mean :1.0103 Mean :0.6579
## 3rd Qu.:1.6667 3rd Qu.:1.0000
## Max. :3.0000 Max. :2.0000
## MTRANS NObeyesdad
## Length:2111 Length:2111
## Class :character Class :character
## Mode :character Mode :character
##
##
##
head(final_data)
# Calculate BMI for plotting
EDA_data <- EDA_data %>%
mutate(BMI = Weight / (Height^2))
names(EDA_data)
## [1] "Gender" "Age"
## [3] "Height" "Weight"
## [5] "family_history_with_overweight" "FAVC"
## [7] "FCVC" "NCP"
## [9] "CAEC" "SMOKE"
## [11] "CH2O" "SCC"
## [13] "FAF" "TUE"
## [15] "CALC" "MTRANS"
## [17] "NObeyesdad" "BMI"
library(kknn)
## Warning: package 'kknn' was built under R version 4.3.3
##
## Attaching package: 'kknn'
## The following object is masked from 'package:caret':
##
## contr.dummy
# Create a clean data frame for training
train_data <- EDA_data[, c("Height", "Weight", "NObeyesdad")]
train_data$NObeyesdad <- as.factor(train_data$NObeyesdad)
train_data <- as.data.frame(train_data)
# Clean test grid
grid1 <- expand.grid(
Height = seq(min(train_data$Height), max(train_data$Height), length.out = 200),
Weight = seq(min(train_data$Weight), max(train_data$Weight), length.out = 200)
)
grid1 <- as.data.frame(grid1)
grid1$Height <- as.numeric(grid1$Height)
grid1$Weight <- as.numeric(grid1$Weight)
# Define formula explicitly
formula_knn <- as.formula("NObeyesdad ~ Height + Weight")
# Train model
knn_model <- kknn(formula = formula_knn, train = train_data, test = grid1, k = 15)
# Get predicted class for each point in the grid
grid1$class <- fitted(knn_model)
# Reorder class levels to match vertical visual stacking from top to bottom
levels_ordered <- c("Insufficient_Weight", "Normal_Weight",
"Overweight_Level_I","Overweight_Level_II",
"Obesity_Type_I", "Obesity_Type_II", "Obesity_Type_III")
EDA_data$NObeyesdad <- factor(EDA_data$NObeyesdad, levels = rev(levels_ordered))
grid1$class <- factor(grid1$class, levels = rev(levels_ordered))
# Plot with one unified legend
ggplot() +
geom_tile(data = grid1, aes(x = Height, y = Weight, fill = class), alpha = 0.25) +
geom_point(data = EDA_data, aes(x = Height, y = Weight, color = NObeyesdad), size = 1.2, alpha = 0.85) +
scale_fill_brewer(palette = "Dark2", name = "Class") +
scale_color_brewer(palette = "Dark2", name = "Class") +
labs(
title = "BMI Class Regions by Height and Weight (K-NN)",
subtitle = "Background shows predicted regions, points show actual classes",
x = "Height (m)",
y = "Weight (kg)"
) +
theme_minimal()
Although the dataset includes BMI category labels under ‘NObeyesdad’, it does not provide numeric BMI values directly.
Instead, this plot reintroduces BMI indirectly by putting its components, Height and Weight, on the axes.
The background color regions come from a k-NN classifier trained on these two features,
which essentially mimics how BMI partitions weight relative to height.
By coloring the regions by predicted class and overlaying the actual class labels as points,
we can visually assess how cleanly the Height-Weight space aligns with the labeled obesity categories.
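As a rough cross-check, the sketch below overlays the analytic BMI cut-off curves (Weight = threshold * Height^2) on the same Height-Weight space so the k-NN regions can be compared against fixed BMI boundaries. The specific thresholds (18.5, 25, 30, 35, 40) are an assumption here; the dataset's own class boundaries were learned from labels rather than fixed cut-offs.
# Sketch: analytic BMI cut-off curves Weight = threshold * Height^2
bmi_cutoffs <- c(18.5, 25, 30, 35, 40) # assumed standard BMI thresholds
cutoff_curves <- expand.grid(
  Height = seq(min(EDA_data$Height), max(EDA_data$Height), length.out = 100),
  BMI_cutoff = bmi_cutoffs
) %>%
  mutate(Weight = BMI_cutoff * Height^2)
ggplot() +
  geom_point(data = EDA_data, aes(x = Height, y = Weight, color = NObeyesdad),
             size = 1.2, alpha = 0.85) +
  geom_line(data = cutoff_curves, aes(x = Height, y = Weight, group = BMI_cutoff),
            linetype = "dashed", color = "grey30") +
  scale_color_brewer(palette = "Dark2", name = "Class") +
  labs(title = "Analytic BMI Cut-off Curves vs. Labeled Classes",
       x = "Height (m)", y = "Weight (kg)") +
  theme_minimal()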
# --- Correlation with BMI Only (Horizontal Layout) ---
cor_matrix <- final_data %>%
select(where(is.numeric)) %>%
cor()
# Extract and transpose BMI correlations
cor_bmi <- as.data.frame(cor_matrix["BMI", , drop = FALSE])
cor_bmi_t <- t(cor_bmi)
# Plot horizontally
ggcorrplot::ggcorrplot(
cor_bmi_t,
type = "full",
lab = TRUE,
lab_size = 3,
method = "square",
colors = c("blue", "white", "red"),
title = "Correlation of BMI with Other Features (Horizontal)",
ggtheme = theme_minimal()
)
############################
# Modelling Preparation
############################
# Design matrix for modeling
full_matrix <- model.matrix(BMI ~ . - 1, data = final_data)
full_label <- final_data$BMI
set.seed(123)
split <- createDataPartition(full_label, p = 0.8, list = FALSE)
train_matrix <- full_matrix[split, ]
test_matrix <- full_matrix[-split, ]
train_label <- full_label[split]
test_label <- full_label[-split]
#create linear regression model and print summary
lm_model <- lm(BMI ~ FAF + SCC + CALC, data = final_data)
summary(lm_model)
##
## Call:
## lm(formula = BMI ~ FAF + SCC + CALC, data = final_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.0101 -5.5762 0.1485 5.8514 20.7423
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.6525 0.3640 81.454 < 2e-16 ***
## FAF -1.4222 0.1975 -7.200 8.32e-13 ***
## SCC -6.6746 0.8031 -8.311 < 2e-16 ***
## CALC 2.4446 0.3250 7.522 7.96e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.667 on 2107 degrees of freedom
## Multiple R-squared: 0.08551, Adjusted R-squared: 0.08421
## F-statistic: 65.68 on 3 and 2107 DF, p-value: < 2.2e-16
All three predictors are statistically significant: FAF (how often the person is physically active), SCC (whether the person monitors their daily calorie intake), and CALC. The FAF and SCC coefficients are negative (higher values are associated with lower BMI), while the CALC coefficient is positive.
The multiple R^2 is 0.086, which is very low; the model explains only about 9% of the variance in BMI.
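To make the coefficient estimates above more concrete, a quick follow-up (sketch) is to look at their 95% confidence intervals:
# 95% confidence intervals for the fitted coefficients
confint(lm_model, level = 0.95)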
# Diagnostics
ggplot2::autoplot(lm_model)
Residual vs Fitted:
The points are spread fairly evenly around the horizontal line at 0, although there are more points above the line (especially near fitted values of ~30). Overall the model captures a rough linear relationship between BMI and FAF, SCC, and CALC, and the errors look mostly random.
There are no obvious curved patterns in the residuals, so this plot alone does not reveal a strong nonlinearity; even so, a model with only three predictors is unlikely to capture all of the complexity in the relationship.
The spread of the residuals grows as the fitted values increase (moving left to right on the plot), which suggests heteroscedasticity, i.e. the error term does not have constant variance.
Q-Q Plot:
The points follow the diagonal reference line only over a small portion of the range and diverge at both tails.
This deviation suggests that the residuals are not normally distributed and may be skewed.
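These visual impressions can be backed up with quick numeric checks. A minimal sketch, assuming the lmtest package is installed (it is not loaded elsewhere in this report):
# Numeric checks of the linear-model assumptions (sketch)
shapiro.test(residuals(lm_model)) # normality of residuals (base R)
lmtest::bptest(lm_model) # Breusch-Pagan test for non-constant variance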
#residuals vs xaxis
plots <- gg_resX(lm_model, plot.all = TRUE)
Looking at the plot of residuals vs. FAF, the points are distributed fairly evenly around the red horizontal line (y = 0), implying the error is random. There is no pronounced curved trend, so this plot does not point to a strong nonlinear relationship between BMI and FAF.
Looking at the plot of residuals vs. SCC, both group medians are close to 0, so the model is not biased toward either group. However, the IQR for ‘no’ is much larger than the IQR for ‘yes’, implying heteroscedasticity (violating an assumption of the linear model). There are also outliers for ‘yes’, which could mean the model explains the ‘no’ cases better than the ‘yes’ cases.
Finally, looking at the plot of residuals vs. CALC, all the medians are near 0, so no category is systematically biased, but the IQRs for ‘no’ and ‘Sometimes’ are much larger than those for ‘Always’ and ‘Frequently’. This might simply reflect more observations in those categories (the group sizes are checked below), but it could also indicate heteroscedasticity and a violation of the linear-model assumptions.
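To check the group-size explanation mentioned above, a quick tally (sketch) of the CALC categories in the raw data:
# Number of observations per CALC category (raw labels from EDA_data)
EDA_data %>% count(CALC)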
#residual histogram
gg_reshist(lm_model)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#cook's d
gg_cooksd(lm_model, threshold = 'matlab')
# Create train/test from original data (not matrix)
train_data_lm <- final_data[split, ]
test_data_lm <- final_data[-split, ]
# Fit model
lm_model <- lm(BMI ~ FAF + SCC + CALC, data = train_data_lm)
# Predict
lr_preds_train <- predict(lm_model, newdata = train_data_lm)
lr_preds_test <- predict(lm_model, newdata = test_data_lm)
# Actuals
actual_train <- train_data_lm$BMI
actual_test <- test_data_lm$BMI
# Metrics
lr_result <- append_model_result("Linear Regression", train_label, lr_preds_train, test_label, lr_preds_test)
# Add to final results
final_model_results <- bind_rows(final_model_results, lr_result)
lr_result
library(rpart)
library(rpart.plot)
# Train decision tree (depth limited to 3)
dt_model <- rpart(BMI ~ ., data = final_data[split, ], method = "anova",
control = rpart.control(maxdepth = 3))
# Visualize the tree
rpart.plot(dt_model, type = 2, extra = 101, fallen.leaves = TRUE,
box.palette = "Blues", main = "Decision Tree (Max Depth = 3)")
We started by training a basic decision tree to predict BMI from all other available variables in our dataset. To keep the model intentionally simple and interpretable, we restricted the tree's complexity in two ways:
Limited the maximum depth to only 3 levels (preventing overcomplicated branches)
Relied on rpart's default complexity parameter (cp = 0.01) to prune splits that add little value
The ‘anova’ method specification tells R we are performing regression (predicting continuous BMI values) rather than classification.
For visualization, we created a clean, informative plot showing:
The hierarchical decision points (splits) based on important predictors
The predicted BMI value at each terminal node (leaf)
A color gradient (using a blue palette) to help distinguish between different prediction ranges
This restrained approach gives us a model that: 1. Is easy to explain to non-technical stakeholders 2. Is quick to train and evaluate 3. Provides a baseline for comparing with more complex models later
The visualization serves as both a diagnostic tool (checking whether the splits make logical sense) and a communication aid (showing how different factors combine to influence BMI predictions). The tree's complexity-parameter table is inspected in the sketch below.
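A minimal follow-up sketch, using rpart's built-in summary of the complexity-parameter table to see which splits were actually kept under these constraints:
# Complexity-parameter table for the depth-limited tree
printcp(dt_model)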
# Predict on train and test
dt_preds_train <- predict(dt_model, newdata = final_data[split, ])
dt_preds_test <- predict(dt_model, newdata = final_data[-split, ])
# Actual values
actual_train <- final_data$BMI[split]
actual_test <- final_data$BMI[-split]
# Calculate metrics
dt_result <- append_model_result("Decision Tree", train_label, dt_preds_train, test_label, dt_preds_test)
# Print summary
print(dt_result)
## # A tibble: 1 × 7
## Model RMSE_Train RMSE_Test MAE_Train MAE_Test R2_Train R2_Test
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Decision Tree 5.63 5.62 4.47 4.46 0.504 0.513
# Save for final comparison (append later)
final_model_results <- bind_rows(final_model_results, dt_result)
We used our simple decision tree to predict BMI in two situations:
For observations it had already seen (training data)
For new observations it had not seen before (test data)
Then we measured how far off the predictions were using RMSE and MAE.
First we made predictions:
dt_preds_train holds the predictions for the training rows; dt_preds_test holds the predictions for the held-out rows.
Then we checked accuracy:
We squared the differences between predictions and actual BMI values, took the average, and then the square root (that is RMSE); the step-by-step arithmetic is spelled out in the sketch below. This gives one number summarizing the typical error size.
Finally we printed the results:
The training error shows how well the tree learned the patterns; the testing error shows how well it generalizes to new people. Here both RMSE values are about 5.6, meaning predictions are typically off by roughly 5.6 BMI units. Smaller numbers mean better predictions.
This helps us understand:
Whether the model is working properly
Whether it predicts better for seen versus unseen observations
How accurate we can expect it to be in real use
# Extract and format variable importance for Decision Tree
dt_importance <- tibble(
Feature = names(dt_model$variable.importance),
Importance = as.numeric(dt_model$variable.importance)
) %>%
arrange(desc(Importance))
# View top features (e.g., top 10)
print(head(dt_importance, 10))
## # A tibble: 10 × 2
## Feature Importance
## <chr> <dbl>
## 1 family_history_with_overweight 24987.
## 2 FCVC 13008.
## 3 CAEC 10969.
## 4 Age 8078.
## 5 MTRANS 3366.
## 6 TUE 2893.
## 7 Gender 1114.
## 8 CH2O 1112.
## 9 CALC 1062.
## 10 FAF 196.
Understanding What Really Affects BMI Predictions
After building our BMI-prediction model, we wanted to know which factors mattered most. Here is how we checked:
We extracted the importance scores from the decision tree model. These scores show how much each characteristic (like family history or age) helped make accurate predictions; the more a variable contributed to the splits, the higher its score.
We organized this information in a table with:
Column 1: the factor name (e.g., "family_history_with_overweight", "Age")
Column 2: its importance score (higher numbers = more important)
A kable() version of this table, with a clear caption, is sketched below to make the comparison easier to read.
This helps us answer questions like:
• What lifestyle factors most affect BMI predictions? • Should we focus on collecting more detailed data about certain factors? • Do the important variables make logical sense?
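A minimal sketch of the kable() display mentioned above, using the dt_importance table already built:
# Formatted display of the top decision-tree features
kable(head(dt_importance, 10),
      caption = "Decision Tree Variable Importance (Top 10)")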
############################
# XGBoost Tuning Functions
############################
# running model and consolidating results
tune_xgboost_param <- function(param_name, param_values, fixed_params) {
results <- tibble(
!!param_name := numeric(),
RMSE = numeric(),
MAE = numeric(),
R2 = numeric()
)
for (val in param_values) {
# Set up the dynamic parameter list
params <- modifyList(fixed_params, setNames(list(val), param_name))
# Train model
model <- xgboost(
data = train_matrix,
label = train_label,
objective = "reg:squarederror",
max_depth = params$max_depth,
eta = params$eta,
nrounds = params$nrounds,
verbose = 0
)
# Predict and evaluate
preds <- predict(model, newdata = test_matrix)
rmse <- sqrt(mean((preds - test_label)^2))
mae <- mean(abs(preds - test_label))
r2 <- 1 - sum((preds - test_label)^2) / sum((mean(test_label) - test_label)^2)
# Append to results
results <- add_row(results,
!!param_name := val,
RMSE = round(rmse, 3),
MAE = round(mae, 3),
R2 = round(r2, 3))
}
return(results)
}
# plot the results & and identify optimal level
plot_xgb_metric_curve <- function(results_df, hyperparam = "max_depth") {
# Find optimal row based on lowest RMSE
optimal_row <- results_df[which.min(results_df$RMSE), ]
optimal_value <- optimal_row[[hyperparam]]
# Reshape for plotting
results_long <- results_df %>%
pivot_longer(cols = c(RMSE, MAE), names_to = "Metric", values_to = "value")
# Plot
ggplot(results_long, aes(x = .data[[hyperparam]], y = value, color = Metric)) +
geom_line(size = 1.2) +
geom_point() +
geom_vline(xintercept = optimal_value, linetype = "dashed", color = "red", size = 1) +
annotate("text",
x = optimal_value + 0.015,
y = median(results_long$value) + .5, # More stable than max()
label = "Optimal",
color = "red",
angle = 90,
vjust = -0.2,
size = 4) +
theme_minimal() +
labs(
title = paste("Model Performance by", gsub("_", " ", tools::toTitleCase(hyperparam))),
x = tools::toTitleCase(gsub("_", " ", hyperparam)),
y = "Metric Value"
)
}
############################
# Depth Tuning (eta = 0.15, nrounds = 100)
############################
depth_results <- tune_xgboost_param("max_depth", seq(3, 12, by = 1),
fixed_params = list(eta = 0.15, nrounds = 100))
depth_plot <- plot_xgb_metric_curve(depth_results, hyperparam = "max_depth")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
depth_results
depth_plot
# ######################################
# Eta Tuning (max_depth = 8, nrounds = 100)
# ######################################
eta_results <- tune_xgboost_param("eta",
seq(0.05, 0.3, by = 0.025),
fixed_params = list(max_depth = 8, nrounds = 100))
eta_plot <- plot_xgb_metric_curve(eta_results, hyperparam = "eta")
eta_results
eta_plot
# ##############################################
# nrounds Tuning (max_depth = 8, eta = 0.15)
# ##############################################
nrounds_results <- tune_xgboost_param(
"nrounds",
seq(20, 300, by = 20),
fixed_params = list(max_depth = 8, eta = 0.15)
)
nrounds_plot <- plot_xgb_metric_curve(nrounds_results, hyperparam = "nrounds")
nrounds_results
nrounds_plot
The XGBoost model significantly outperforms the linear model:
Test RMSE dropped from 7.78 (linear regression) to 2.85
Test MAE was also low at ~1.9, suggesting consistent predictions
The linear model is extremely simple, with only 3 predictors. It is interpretable (from the summary above, a one-unit increase in FAF is associated with a decrease of roughly 1.4 BMI units) but it misses non-linear interactions.
The XGBoost model used every available variable and automatically captured complex patterns the linear model couldn't.
If interpretability and simplicity are the goal (e.g., doctor working on a patient to patient basis trying to suggest lifestyle changes and explain health factors to the patient), the linear model makes sense.
If the goal is predictive performance (e.g., identifying at-risk individuals for prediabetic care with highest accuracy possible), XGBoost model is much better.
# ##############################################
# Final Tuned XGBoost Model
# ##############################################
# Train final model with optimized hyperparameters
final_model <- xgboost(
data = train_matrix,
label = train_label,
objective = "reg:squarederror",
max_depth = 8,
eta = 0.15,
nrounds = 80,
verbose = 0
)
# Evaluate XGBoost on both train and test sets
xgb_preds_train <- predict(final_model, newdata = train_matrix)
xgb_preds_test <- predict(final_model, newdata = test_matrix)
# Calculate metrics
xgb_results <- append_model_result("XGBoost", train_label, xgb_preds_train, test_label, xgb_preds_test)
# Print and store
print(xgb_results)
## # A tibble: 1 × 7
## Model RMSE_Train RMSE_Test MAE_Train MAE_Test R2_Train R2_Test
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 XGBoost 0.664 2.85 0.41 1.88 0.993 0.875
final_model_results <- bind_rows(final_model_results, xgb_results)
# Feature importance using final_model (Based on gain)
importance_matrix <- xgb.importance(model = final_model, feature_names = colnames(train_matrix))
# View as table
print(importance_matrix)
## Feature Gain Cover Frequency
## 1: family_history_with_overweight 0.2860664643 0.03590969 0.024422269
## 2: FCVC 0.1454438539 0.09477250 0.095456933
## 3: Age 0.1214702167 0.25793271 0.271927521
## 4: CAEC 0.1210675927 0.04564219 0.040309874
## 5: FAF 0.0710067254 0.13175083 0.119747899
## 6: TUE 0.0685307091 0.10431462 0.085215336
## 7: NCP 0.0664809652 0.07799870 0.078912815
## 8: MTRANS 0.0473172015 0.02361420 0.027836134
## 9: Gender 0.0322425078 0.02341712 0.067489496
## 10: CH2O 0.0211133799 0.14943158 0.114364496
## 11: CALC 0.0086893973 0.01777546 0.033350840
## 12: SCC 0.0055729744 0.01316322 0.009059874
## 13: FAVC 0.0039991116 0.01161241 0.024028361
## 14: SMOKE 0.0009989003 0.01266478 0.007878151
# Optional: Plot top 15 features
xgb.plot.importance(
importance_matrix,
top_n = 15,
measure = "Gain",
rel_to_first = TRUE,
xlab = "Relative Importance"
)
# Step 1: Prepare DMatrix for SHAP
dtrain <- xgb.DMatrix(data = train_matrix, label = train_label)
# Step 2: Compute SHAP values
shap_values <- shap.values(xgb_model = final_model, X_train = train_matrix)
# Step 3: Create SHAP summary
shap_summary <- shap_values$mean_shap_score
shap_importance <- data.frame(
Feature = names(shap_summary),
Mean_SHAP = shap_summary
) |>
arrange(desc(Mean_SHAP))
# View top 10 SHAP features
head(shap_importance, 10)
Strong agreement across SHAP and Gain rankings suggests these top features are truly valuable (especially family history, FCVC, and Age).
SHAP provides extra context: it shows that even features like CH2O and Gender that aren’t frequently split on still influence predictions consistently.
Gain shows how much each split helps reduce error, so it’s more tree-structure specific.
SHAP captures the total effect on the model’s prediction output, even if the feature is only used in subtle ways.
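A complementary view (sketch): SHAPforxgboost can also build a per-observation SHAP summary ("beeswarm") plot, which shows not just average magnitude but the direction of each feature's effect. This uses the package's shap.prep() and shap.plot.summary() helpers with their default arguments, which is an assumption; the plot is not part of the original output above.
# SHAP summary (beeswarm) plot for the final XGBoost model
shap_long <- shap.prep(xgb_model = final_model, X_train = train_matrix)
shap.plot.summary(shap_long)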
final_model_results %>%
arrange(RMSE_Test) %>%
kable(caption = "Model Comparison: Sorted by Test RMSE")
| Model | RMSE_Train | RMSE_Test | MAE_Train | MAE_Test | R2_Train | R2_Test |
|---|---|---|---|---|---|---|
| XGBoost | 0.664 | 2.847 | 0.410 | 1.882 | 0.993 | 0.875 |
| Decision Tree | 5.634 | 5.615 | 4.469 | 4.461 | 0.504 | 0.513 |
| Linear Regression | 7.631 | 7.777 | 6.390 | 6.416 | 0.090 | 0.065 |
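A small visual companion to this table (a sketch; the plot is not part of the original report), comparing test RMSE across the three models:
# Bar chart of test RMSE by model
final_model_results %>%
  ggplot(aes(x = reorder(Model, RMSE_Test), y = RMSE_Test)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Test RMSE by Model", x = NULL, y = "RMSE (Test)") +
  theme_minimal()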
# --- Step 1: Rank features for each model ---
# XGBoost top 10 features
xgb_top <- xgb.importance(model = final_model, feature_names = colnames(train_matrix)) %>%
slice_max(Gain, n = 10) %>%
mutate(Rank = row_number()) %>%
select(Rank, XGBoost = Feature)
# Decision Tree top 10 features (coerce to tibble explicitly)
dt_top <- tibble(
Feature = names(dt_model$variable.importance),
Importance = as.numeric(dt_model$variable.importance)
) %>%
arrange(desc(Importance)) %>%
dplyr::slice(1:10) %>%
mutate(Rank = row_number()) %>%
select(Rank, Decision_Tree = Feature)
# Linear Regression top 3 features based on t-stat
lm_summary <- summary(lm_model)
lr_top <- tibble(
Feature = rownames(lm_summary$coefficients),
T_stat = lm_summary$coefficients[, "t value"]
) %>%
filter(Feature != "(Intercept)") %>%
arrange(desc(abs(T_stat))) %>%
mutate(Rank = row_number()) %>%
dplyr::slice(1:3) %>%
select(Rank, Linear_Regression = Feature)
# --- Step 2: Merge all rankings into a single table ---
# Create a full Rank index (1 to 10)
rank_frame <- tibble(Rank = 1:10)
# Join all ranking tables
feature_rankings <- rank_frame %>%
left_join(xgb_top, by = "Rank") %>%
left_join(dt_top, by = "Rank") %>%
left_join(lr_top, by = "Rank")
# --- Step 3: Print clean table ---
library(knitr)
kable(feature_rankings, caption = "Top Feature Rankings Across Models")
| Rank | XGBoost | Decision_Tree | Linear_Regression |
|---|---|---|---|
| 1 | family_history_with_overweight | family_history_with_overweight | SCC |
| 2 | FCVC | FCVC | CALC |
| 3 | Age | CAEC | FAF |
| 4 | CAEC | Age | NA |
| 5 | FAF | MTRANS | NA |
| 6 | TUE | TUE | NA |
| 7 | NCP | Gender | NA |
| 8 | MTRANS | CH2O | NA |
| 9 | Gender | CALC | NA |
| 10 | CH2O | FAF | NA |
Conclusion:
Methods - Describe building a shallow decision tree with depth = 3 and the reasoning behind balancing interpretability and prediction.
Results - Report the training RMSE and testing RMSE, and include the decision tree plot (Figure 1) and the variable importance table (Table 1).
Discussion - Discuss decision trees being simple to interpret but sometimes underfitting complex patterns.
1. Linear Regression
Pros:
Interpretability: Linear regression is easily interpretable. The coefficients directly show the relationship between each predictor and the outcome variable
Simplicity: It’s a simple model to understand/implement
Cons:
Not good for prediction: In this model, the R^2 of roughly 0.09 indicates very low predictive power (the model explains only a small portion of the variance in BMI)
Violated Linear Assumptions: Linear regression relies on several assumptions (linearity, independence, homoscedasticity, normality of errors), which were violated according to our diagnostic plots
Limited Complexity: It can only model linear relationships, and this data is not linearly related, so it misses complexities in the data
2. XGBoost
Pros:
Good for predicting: XGBoost outperformed linear regression with a lower RMSE and MAE. It better captures complex relationships and makes good predictions
Handles Nonlinear relations: XGBoost can handle/capture the nonlinear relationship that is present between variables
Cons:
Interpretability: XGBoost models are harder to interpret than linear regression
Complexity: XGBoost has many hyperparameters that require tuning
3. Decision Tree
Pros:
Interpretability: Decision trees are relatively easy to interpret, especially shallow trees like the one we used
Visualization: Decision trees are easily visualized so we can see how the model works
Cons:
Not great at predicting: Shallow decision trees don’t predict as well as XGBoost
Potential Underfitting: Shallow trees can underfit the data, missing some of the patterns it contains
For our case:
The linear regression model is probably inadequate: the relationships in the data are nonlinear, so it cannot capture them accurately, and several of the linear-model assumptions are violated.
The XGBoost model is probably the best choice: it has the highest predictive power and the best evaluation metrics, although it is harder to interpret than linear regression or a decision tree.