# --- Load All Required Libraries ---
# Core data manipulation
library(tidyverse) # includes ggplot2, dplyr, readr, etc.
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(data.table) # fast data manipulation
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(caret) # train/test splitting, ML tools
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(Metrics) # evaluation metrics
## Warning: package 'Metrics' was built under R version 4.3.3
##
## Attaching package: 'Metrics'
##
## The following objects are masked from 'package:caret':
##
## precision, recall
# Plotting and visualization
library(ggthemes) # enhanced ggplot themes
## Warning: package 'ggthemes' was built under R version 4.3.3
library(ggcorrplot) # correlation plots
## Warning: package 'ggcorrplot' was built under R version 4.3.3
library(ggrepel) # improved text labels for ggplot
## Warning: package 'ggrepel' was built under R version 4.3.2
library(GGally) # extended ggplot
## Warning: package 'GGally' was built under R version 4.3.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggfortify) # auto plotting for statistical objects
## Warning: package 'ggfortify' was built under R version 4.3.3
library(patchwork) # combine plots
## Warning: package 'patchwork' was built under R version 4.3.3
library(lindia) # linear model diagnostics
## Warning: package 'lindia' was built under R version 4.3.3
# Modeling tools
library(xgboost) # gradient boosting
## Warning: package 'xgboost' was built under R version 4.3.3
##
## Attaching package: 'xgboost'
##
## The following object is masked from 'package:dplyr':
##
## slice
library(rpart) # decision trees
library(rpart.plot) # decision tree visualization
library(kknn) # k-nearest neighbors
## Warning: package 'kknn' was built under R version 4.3.3
##
## Attaching package: 'kknn'
##
## The following object is masked from 'package:caret':
##
## contr.dummy
library(SHAPforxgboost) # SHAP values for xgboost
## Warning: package 'SHAPforxgboost' was built under R version 4.3.3
library(knitr) # table formatting for outputs
library(broom) # tidy model outputs
#########################
# Data Pre Processing
#########################
# Load dataset
obesity <- read_csv("ObesityDataSet_raw_and_data_sinthetic.csv")
## Rows: 2111 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): Gender, family_history_with_overweight, FAVC, CAEC, SMOKE, SCC, CAL...
## dbl (8): Age, Height, Weight, FCVC, NCP, CH2O, FAF, TUE
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Define readable feature names mapping
feature_names <- c(
"family_history_with_overweight" = "Family History",
"MTRANS" = "Transportation Mode",
"CAEC" = "Eating Between Meals",
"FAVC" = "High-Calorie Food",
"CALC" = "Alcohol Consumption",
"FCVC" = "Fruit & Veg. Consumption",
"FAF" = "Physical Activity",
"SCC" = "Calorie Monitoring",
"NCP" = "Number of Meals",
"CH2O" = "Water Consumption",
"TUE" = "Technology Use Time",
"SMOKE" = "Smoking Status"
)
# Utility function to rename features consistently
rename_with_mapping <- function(df_or_vector, mapping = feature_names) {
# Check if input is a data frame
if (is.data.frame(df_or_vector)) {
# For data frames, rename columns
cols_to_rename <- intersect(colnames(df_or_vector), names(mapping))
if (length(cols_to_rename) > 0) {
colnames(df_or_vector)[match(cols_to_rename, colnames(df_or_vector))] <-
mapping[cols_to_rename]
}
} else if (is.matrix(df_or_vector) || is.vector(df_or_vector)) {
# For matrices or vectors, rename rows or names
if (!is.null(rownames(df_or_vector))) {
names_to_rename <- intersect(rownames(df_or_vector), names(mapping))
if (length(names_to_rename) > 0) {
rownames(df_or_vector)[match(names_to_rename, rownames(df_or_vector))] <-
mapping[names_to_rename]
}
} else if (!is.null(names(df_or_vector))) {
names_to_rename <- intersect(names(df_or_vector), names(mapping))
if (length(names_to_rename) > 0) {
names(df_or_vector)[match(names_to_rename, names(df_or_vector))] <-
mapping[names_to_rename]
}
}
}
return(df_or_vector)
}
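To show how this helper behaves, here's a quick illustration on a named vector (illustrative values only, not part of the analysis):
# Example: apply the readable-name mapping to a named vector
example_scores <- c(FAF = 0.42, SMOKE = 0.03, Age = 0.15)
rename_with_mapping(example_scores)
# names become "Physical Activity" and "Smoking Status"; "Age" is left unchanged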
# Clean and encode dataset in single transformation
final_data <- obesity %>%
mutate(BMI = Weight / (Height^2)) %>%
select(-Weight, -Height, -NObeyesdad) %>%
mutate(
Gender = if_else(Gender == "Female", 0, 1),
family_history_with_overweight = if_else(family_history_with_overweight == "no", 0, 1),
FAVC = if_else(FAVC == "no", 0, 1),
CAEC = case_when(
CAEC == "no" ~ 0, CAEC == "Sometimes" ~ 1,
CAEC == "Frequently" ~ 2, CAEC == "Always" ~ 3
),
SMOKE = if_else(SMOKE == "no", 0, 1),
SCC = if_else(SCC == "no", 0, 1),
CALC = case_when(
CALC == "no" ~ 0, CALC == "Sometimes" ~ 1,
CALC == "Frequently" ~ 2, CALC == "Always" ~ 3
),
MTRANS = case_when(
MTRANS == "Walking" ~ 0, MTRANS == "Bike" ~ 1,
MTRANS == "Public_Transportation" ~ 2, MTRANS == "Motorbike" ~ 3,
MTRANS == "Automobile" ~ 4
)
)
# Consolidated function for model result assessment
append_model_result <- function(name, actual_train, pred_train, actual_test, pred_test) {
tibble(
Model = name,
RMSE_Train = sqrt(mean((actual_train - pred_train)^2)),
RMSE_Test = sqrt(mean((actual_test - pred_test)^2)),
MAE_Train = mean(abs(actual_train - pred_train)),
MAE_Test = mean(abs(actual_test - pred_test)),
R2_Train = 1 - sum((pred_train - actual_train)^2) / sum((mean(actual_train) - actual_train)^2),
R2_Test = 1 - sum((pred_test - actual_test)^2) / sum((mean(actual_test) - actual_test)^2)
) %>%
mutate(across(where(is.numeric), \(x) round(x, 3)))
}
# Initialize results storage
final_model_results <- tibble()
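As a quick sanity check of the helper (toy vectors, not real predictions), it returns a one-row tibble of rounded metrics:
# Illustrative call with made-up actuals and predictions
append_model_result("Toy Model",
                    actual_train = c(20, 25, 30), pred_train = c(21, 24, 31),
                    actual_test = c(22, 28), pred_test = c(23, 27))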
# Create EDA dataset with factor levels preserved
EDA_data <- read_csv("ObesityDataSet_raw_and_data_sinthetic.csv") %>%
mutate(
BMI = Weight / (Height^2),
CAEC = factor(CAEC, levels = c("no", "Sometimes", "Frequently", "Always")),
CALC = factor(CALC, levels = c("no", "Sometimes", "Frequently", "Always")),
MTRANS = factor(MTRANS, levels = c("Walking", "Bike", "Public_Transportation", "Motorbike", "Automobile")),
FAVC = factor(FAVC, levels = c("no", "yes")),
SMOKE = factor(SMOKE, levels = c("no", "yes")),
SCC = factor(SCC, levels = c("no", "yes")),
family_history_with_overweight = factor(family_history_with_overweight, levels = c("no", "yes")),
Gender = factor(Gender, levels = c("Female", "Male"))
)
## Rows: 2111 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): Gender, family_history_with_overweight, FAVC, CAEC, SMOKE, SCC, CAL...
## dbl (8): Age, Height, Weight, FCVC, NCP, CH2O, FAF, TUE
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Our data is made up of 2,111 instances with 17 columns: 16 explanatory attributes plus the target. There are no nulls, and the dataset is fairly clean already.
It was collected in 2019.
23% of the data was collected organically via web surveys; the other 77% was generated synthetically with SMOTE.
It was collected from individuals in 3 Latin American countries (Mexico, Peru, and Colombia), so the target feature and others are named in Spanish or use Spanish abbreviations.
The target variable, NObeyesdad, classifies each person into 7 weight levels based on their weight-indexing scale. This differs from the typical U.S. classification of 4 classes (underweight, normal, overweight, and obese), or 5 when extremely obese is included.
The dataset also provides weight and height, so we'll turn this into a regression task rather than classification and attempt to predict BMI.
Here's a quick summary of those features in our initial EDA dataset:
############################
# Exploratory Data Analysis
############################
# --- Summary Stats ---
# View summary statistics for all features in EDA_data
summary(EDA_data)
## Gender Age Height Weight
## Female:1043 Min. :14.00 Min. :1.450 Min. : 39.00
## Male :1068 1st Qu.:19.95 1st Qu.:1.630 1st Qu.: 65.47
## Median :22.78 Median :1.700 Median : 83.00
## Mean :24.31 Mean :1.702 Mean : 86.59
## 3rd Qu.:26.00 3rd Qu.:1.768 3rd Qu.:107.43
## Max. :61.00 Max. :1.980 Max. :173.00
## family_history_with_overweight FAVC FCVC NCP
## no : 385 no : 245 Min. :1.000 Min. :1.000
## yes:1726 yes:1866 1st Qu.:2.000 1st Qu.:2.659
## Median :2.386 Median :3.000
## Mean :2.419 Mean :2.686
## 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :3.000 Max. :4.000
## CAEC SMOKE CH2O SCC FAF
## no : 51 no :2067 Min. :1.000 no :2015 Min. :0.0000
## Sometimes :1765 yes: 44 1st Qu.:1.585 yes: 96 1st Qu.:0.1245
## Frequently: 242 Median :2.000 Median :1.0000
## Always : 53 Mean :2.008 Mean :1.0103
## 3rd Qu.:2.477 3rd Qu.:1.6667
## Max. :3.000 Max. :3.0000
## TUE CALC MTRANS
## Min. :0.0000 no : 639 Walking : 56
## 1st Qu.:0.0000 Sometimes :1401 Bike : 7
## Median :0.6253 Frequently: 70 Public_Transportation:1580
## Mean :0.6579 Always : 1 Motorbike : 11
## 3rd Qu.:1.0000 Automobile : 457
## Max. :2.0000
## NObeyesdad BMI
## Length:2111 Min. :13.00
## Class :character 1st Qu.:24.33
## Mode :character Median :28.72
## Mean :29.70
## 3rd Qu.:36.02
## Max. :50.81
# final dataset with numeric categorical features
head(final_data)
# --- Target Variable Analysis ---
# Create a clean data frame for training
train_data <- EDA_data[, c("Height", "Weight", "NObeyesdad")]
train_data$NObeyesdad <- as.factor(train_data$NObeyesdad)
train_data <- as.data.frame(train_data)
# Clean test grid
grid1 <- expand.grid(
Height = seq(min(train_data$Height), max(train_data$Height), length.out = 200),
Weight = seq(min(train_data$Weight), max(train_data$Weight), length.out = 200)
)
grid1 <- as.data.frame(grid1)
grid1$Height <- as.numeric(grid1$Height)
grid1$Weight <- as.numeric(grid1$Weight)
# Define formula explicitly
formula_knn <- as.formula("NObeyesdad ~ Height + Weight")
# Train model
knn_model <- kknn(formula = formula_knn, train = train_data, test = grid1, k = 15)
# Get predicted class for each point in the grid
grid1$class <- fitted(knn_model)
# Reorder class levels to match vertical visual stacking from top to bottom
levels_ordered <- c("Insufficient_Weight", "Normal_Weight",
"Overweight_Level_I","Overweight_Level_II",
"Obesity_Type_I", "Obesity_Type_II", "Obesity_Type_III")
EDA_data$NObeyesdad <- factor(EDA_data$NObeyesdad, levels = rev(levels_ordered))
grid1$class <- factor(grid1$class, levels = rev(levels_ordered))
# Plot with one unified legend
ggplot() +
geom_tile(data = grid1, aes(x = Height, y = Weight, fill = class), alpha = 0.25) +
geom_point(data = EDA_data, aes(x = Height, y = Weight, color = NObeyesdad), size = 1.2, alpha = 0.85) +
scale_fill_brewer(palette = "Dark2", name = "Class") +
scale_color_brewer(palette = "Dark2", name = "Class") +
labs(
title = "BMI Class Regions by Height and Weight (K-NN)",
subtitle = "Background shows predicted regions, points show actual classes",
x = "Height (m)",
y = "Weight (kg)"
) +
theme_minimal()
Although the dataset includes BMI category labels under ‘NObeyesdad’, it does not use numeric BMI values directly.
Instead, this plot reintroduces BMI indirectly by visualizing its components, Height and Weight.
The background color regions are created using a k-NN classifier trained on these two features, essentially mimicking how BMI partitions weight relative to height.
By coloring the regions based on predicted class and overlaying actual class points, we can visually assess how cleanly the BMI space aligns with the labeled obesity categories.
From here we'll introduce the actual target feature, BMI, which is weight (kg) / height (m)², and look at how each of our explanatory features relates to it.
We'll do this in 2 ways:
1.) Univariate Analysis
2.) Bivariate Analysis
The analysis will be done in 3 ways:
1.) Correlation
2.) Variance
3.) Visualizations
# --- Correlation with BMI Only ---
# Calculate correlation matrix
cor_matrix <- final_data %>%
select(where(is.numeric)) %>%
cor()
# Extract and transpose BMI correlations
cor_bmi <- as.data.frame(cor_matrix["BMI", , drop = FALSE])
cor_bmi_t <- t(cor_bmi)
# Create a named vector for column renaming
col_mapping <- feature_names[names(feature_names) %in% rownames(cor_bmi_t)]
# Apply the mapping to rename the rows
rownames(cor_bmi_t)[match(names(col_mapping), rownames(cor_bmi_t))] <- col_mapping
# Plot horizontally
ggcorrplot::ggcorrplot(
cor_bmi_t,
type = "full",
lab = TRUE,
lab_size = 3,
method = "square",
colors = c("blue", "white", "red"),
title = "Correlation of BMI with Other Features",
ggtheme = theme_minimal()
)
Family history, Age, frequency of eating high-calorie food (FAVC), and fruit & vegetable consumption (FCVC) have the strongest positive linear relationships with BMI.
Eating between meals (CAEC), calorie monitoring (SCC), and physical activity (FAF) have the strongest negative linear relationships.
This isn't to say the other features aren't influencing BMI; their influence just isn't directly linear. Next I'll look at how well each feature explains the variance in BMI.
# --- Single Feature BMI Variance Analysis ---
# Get variance explained by each feature
feature_importance <- tibble(
Feature = character(),
Variance_Explained = numeric()
)
# Features to test
features_to_test <- c(
"Age", "FAF", "FCVC", "NCP", "CH2O", "TUE",
"Gender", "family_history_with_overweight", "FAVC",
"CAEC", "SMOKE", "SCC", "CALC", "MTRANS"
)
# More efficient approach using map_dfr
feature_importance <- map_dfr(features_to_test, function(feature) {
# For (nominally) categorical features; note final_data is fully numeric,
# so aov() here reduces to a single-df linear fit (same R² as lm())
if (is.factor(EDA_data[[feature]]) || is.character(EDA_data[[feature]]) ||
length(unique(EDA_data[[feature]])) < 10) {
formula_str <- paste("BMI ~", feature)
model <- aov(as.formula(formula_str), data = final_data)
anova_result <- summary(model)
ss_total <- sum(anova_result[[1]]$`Sum Sq`)
ss_model <- anova_result[[1]]$`Sum Sq`[1]
r2 <- ss_model / ss_total
} else {
formula_str <- paste("BMI ~", feature)
model <- lm(as.formula(formula_str), data = final_data)
r2 <- summary(model)$r.squared
}
tibble(Feature = feature, Variance_Explained = r2)
})
# Get top features
top_features <- feature_importance %>%
arrange(desc(Variance_Explained)) %>%
head(8) %>%
mutate(
# Use our rename_with_mapping function to get readable feature names
Feature = ifelse(Feature %in% names(feature_names),
feature_names[Feature],
Feature),
# Format as percentage
Variance_Explained = paste0(round(Variance_Explained * 100, 1), "%")
)
# Display table
kable(top_features,
caption = "Top Features Ranked by BMI Variance Explained",
col.names = c("Feature", "BMI Variance Explained"))
Feature | BMI Variance Explained |
---|---|
Family History | 23.4% |
Eating Between Meals | 9.8% |
Fruit & Veg. Consumption | 7% |
High-Calorie Food | 6.1% |
Age | 6% |
Calorie Monitoring | 3.4% |
Physical Activity | 3.2% |
Alcohol Consumption | 2.9% |
Our correlation values showed that similar features have the largest impact on BMI: Family History, CAEC, FCVC, FAVC, and Age are the same top 5 in correlation magnitude.
Next we’ll visualize these relationships – starting with our continuous predictors. I’ll make this as clean as possible with three concise plots:
1.) A scatter plot between the feature and our target with a smoothed trend line
2.) A histogram to plot the distribution of the predictor
3.) A histogram to analyze the distribution of the target, using the designated predictor as a color scale filter
# --- Continuous Predictor Analysis ---
# Create a unified function for continuous predictor analysis
continuous_analyzer <- function(var_name, data = EDA_data) {
# Get readable variable label using our mapping
var_label <- if (var_name %in% names(feature_names)) feature_names[var_name] else var_name
p1 <- ggplot(data, aes(x = .data[[var_name]], y = BMI)) +
geom_point(alpha = 0.5, color = "darkblue") +
geom_smooth(method = "loess", se = FALSE, color = "red") +
labs(title = paste("BMI vs", var_label), x = var_label, y = "BMI") +
theme_minimal()
p2 <- ggplot(data, aes(x = .data[[var_name]])) +
geom_histogram(fill = "skyblue", color = "black", bins = 30) +
labs(title = paste(var_label, "Distribution"), x = var_label, y = "Count") +
theme_minimal()
# Bin BMI into 30 intervals and compute the mean of the selected variable per bin
data_binned <- data %>%
mutate(BMI_bin = cut(BMI, breaks = 30)) %>%
mutate(BMI_mid = (as.numeric(sub("\\((.+),.*", "\\1", BMI_bin)) +
as.numeric(sub("[^,]*,([^]]*)\\]", "\\1", BMI_bin))) / 2)
bin_summary <- data_binned %>%
group_by(BMI_mid) %>%
summarise(count = n(), avg_val = mean(.data[[var_name]], na.rm = TRUE)) %>%
filter(!is.na(BMI_mid))
# BMI histogram shaded by average feature
p3 <- ggplot(bin_summary, aes(x = BMI_mid, y = count, fill = avg_val)) +
geom_col(color = "black") +
scale_fill_gradient(low = "green", high = "red", name = paste("Avg", var_label)) +
labs(title = paste("BMI Distribution\n(shaded by avg.", var_label, ")"),
x = "BMI", y = "Count", fill = paste("Avg", var_label)) +
theme_minimal()
# Combine plots with patchwork (already loaded above)
(p1 | p2) / p3
}
# Apply to Age
continuous_analyzer("Age")
## `geom_smooth()` using formula = 'y ~ x'
continuous_analyzer("FCVC")
## `geom_smooth()` using formula = 'y ~ x'
Interesting how the people with the highest average fruit and vegetable consumption apparently have the highest BMIs too.
Are people inflating their consumption of healthy food to feel better about themselves, or is this just tied to overall higher food consumption?
People in this sample eat far more fruit and vegetables than I'd expect in the US. On a scale of 1 to 3, very few selected 1.
This feature is stored as continuous, but it should arguably be converted to ordered classes of 1, 2, or 3, because no one used decimals in the survey (the decimal values presumably come from the synthetic SMOTE records).
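As a quick sketch of that idea (hypothetical recoding, not applied to the modeling data):
# Snap FCVC back to its original survey levels and treat it as ordered
EDA_data %>%
  mutate(FCVC_level = factor(round(FCVC), levels = 1:3, ordered = TRUE)) %>%
  count(FCVC_level)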
continuous_analyzer("NCP")
## `geom_smooth()` using formula = 'y ~ x'
A large majority of people eat 3 meals a day, as I'd expect.
The people who ate the fewest meals per day had very average BMIs.
It has a similar distribution to FCVC in that it could almost be 4 classes rather than continuous, but this one is much more imbalanced than the last, so I'd imagine it won't provide much predictive power.
continuous_analyzer("FAF")
## `geom_smooth()` using formula = 'y ~ x'
A pretty consistent negative correlation here, as I'd expect: the more people work out, the lower their BMI.
In fact, there isn't a single instance above a BMI of 35 once the person exceeds an FAF level of 2.
This feature is imbalanced and skewed towards less exercise.
continuous_analyzer("TUE")
## `geom_smooth()` using formula = 'y ~ x'
I'm not exactly sure how respondents rated themselves on the 0-2 scale here, but tech usage in this sample seems much lower than I'd expect in the US, given how many are at zero. Keep in mind the sample is a 2019 collection of individuals from Mexico, Peru, and Colombia.
There's a pretty strong spike in BMI among people around a 0.5 TUE level.
continuous_analyzer("CH2O")
## `geom_smooth()` using formula = 'y ~ x'
# --- Categorical & Binary Predictor Analysis ---
# Define a reusable boxplot function with count labels
make_bmi_boxplot <- function(var_name) {
# Get readable variable label using our mapping
var_label <- if (var_name %in% names(feature_names)) feature_names[var_name] else var_name
# Calculate counts for each level
counts <- table(EDA_data[[var_name]])
# Create labels with counts
count_labels <- paste0(names(counts), "\n(n=", counts, ")")
ggplot(EDA_data, aes(x = .data[[var_name]], y = BMI)) +
geom_boxplot(fill = "skyblue", color = "black", outlier.alpha = 0.3) +
scale_x_discrete(labels = count_labels) + # Apply the count labels
labs(
x = var_label,
y = "BMI",
title = paste("BMI by", var_label)
) +
theme_minimal(base_size = 14) +
theme(
axis.text.x = element_text(angle = 30, hjust = 1, size = 16),
axis.text.y = element_text(size = 16),
plot.title = element_text(size = 16, face = "bold"),
axis.title = element_text(size = 16)
)
}
# Create each plot
p1 <- make_bmi_boxplot("Gender")
p2 <- make_bmi_boxplot("family_history_with_overweight")
p3 <- make_bmi_boxplot("FAVC")
p4 <- make_bmi_boxplot("CAEC")
p5 <- make_bmi_boxplot("SMOKE")
p6 <- make_bmi_boxplot("SCC")
p7 <- make_bmi_boxplot("CALC")
p8 <- make_bmi_boxplot("MTRANS")
# Assemble in a 4x2 grid using patchwork
wrap_plots(p1, p2, p3, p4, p5, p6, p7, p8, ncol = 2) +
plot_layout(guides = "collect")
As a whole we see a lot of imbalance in features that seems unrealistic; I attribute this to the synthetic SMOTE data undersampling groups like smokers and oversampling groups like those with a family history of overweight.
Gender:
Family History:
There appears to be a strong correlation between family history and BMI levels showing the importance of genetics as well as eating habits picked up from parents.
The imbalance between yes and no is interesting. There are far more 'yes' instances, leading me to believe that a wide variety of things count as a family history of overweight beyond genetic disposition. Even if the deciding factor were having just a single overweight parent, I still wouldn't expect a 1726-to-385 split.
Smoking:
The medians and IQRs are almost identical, but non-smokers appear more likely to end up at the extremes of the BMI spectrum.
Part of that is likely related to the class imbalance: this sample has only 44 smokers, which leads me to doubt this feature.
I doubt anywhere in the world has a cigarette-smoking rate as low as this sample's ~2%.
Calorie Monitoring:
People who don’t track calories generally had a much higher BMI
The small group (96) that did track them had a median about 5 points lower.
Mode of Transportation:
Alcohol Consumption:
The people who “sometimes” consume alcohol were the highest in BMI while ‘always’ was lower than ‘no’
The “sometimes” alcohol consumers having the highest BMI is an interesting pattern that may be explained by effects that alcohol has on the body. Research suggests that moderate alcohol consumption can interfere with fat metabolism because the body prioritizes processing alcohol over burning fat.
Additionally, alcohol consumption is often accompanied by increased calorie intake from both the alcoholic beverages themselves and associated eating behaviors (such as snacking or consuming larger meals).
If you're a frequent drinker, I'd imagine you'd also be more likely to spread your eating out across the day.
There is also one 21-year-old man who apparently is 'always' consuming alcohol; here he is:
always_alcohol <- EDA_data %>%
filter(CALC == "Always")
print(always_alcohol)
## # A tibble: 1 × 18
## Gender Age Height Weight family_history_with_overw…¹ FAVC FCVC NCP CAEC
## <fct> <dbl> <dbl> <dbl> <fct> <fct> <dbl> <dbl> <fct>
## 1 Male 21 1.7 65 yes yes 2 1 Freq…
## # ℹ abbreviated name: ¹family_history_with_overweight
## # ℹ 9 more variables: SMOKE <fct>, CH2O <dbl>, SCC <fct>, FAF <dbl>, TUE <dbl>,
## # CALC <fct>, MTRANS <fct>, NObeyesdad <fct>, BMI <dbl>
# --- Collinearity Analysis ---
# Remove BMI column
final_data_no_bmi <- final_data %>% select(-BMI)
# Calculate correlation matrix for numeric columns only
cor_matrix <- final_data_no_bmi %>%
select(where(is.numeric)) %>%
cor(use = "complete.obs") # in case there are any missing values
# Apply readable feature names
rownames(cor_matrix) <- ifelse(rownames(cor_matrix) %in% names(feature_names),
feature_names[rownames(cor_matrix)],
rownames(cor_matrix))
colnames(cor_matrix) <- ifelse(colnames(cor_matrix) %in% names(feature_names),
feature_names[colnames(cor_matrix)],
colnames(cor_matrix))
# Plot the correlation matrix
ggcorrplot(cor_matrix,
method = "square",
type = "lower",
lab = TRUE,
lab_size = 3,
colors = c("blue", "white", "red"),
title = "Correlation Matrix (Excluding BMI)",
ggtheme = theme_minimal())
Features with the highest pairwise correlations (potential multicollinearity):
Age & Mode of Transportation: 0.57, strong positive (older people are more likely to use automobiles)
Age & Tech Usage: -0.30, moderate negative (older people report lower tech usage)
Gender & Fruit & Veg. Consumption: -0.27, moderate negative (men eat less fruit and veg)
Gender & Physical Activity: 0.19, weak positive (men exercise more frequently)
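To go beyond pairwise correlations, variance inflation factors are a standard multicollinearity check; a minimal sketch, assuming the car package is installed (it isn't loaded above):
# VIFs for a full linear model of BMI on all predictors
vif_model <- lm(BMI ~ ., data = final_data)
car::vif(vif_model)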
############################
# Modelling Preparation
############################
# Design matrix for modeling
full_matrix <- model.matrix(BMI ~ . - 1, data = final_data)
full_label <- final_data$BMI
set.seed(123)
split <- createDataPartition(full_label, p = 0.8, list = FALSE)
train_matrix <- full_matrix[split, ]
test_matrix <- full_matrix[-split, ]
train_label <- full_label[split]
test_label <- full_label[-split]
(I made the CALC feature numeric since it's ordinal, which lets us draw inference from a single coefficient; now you can say "for each tier increase in alcohol consumption, the person's BMI increases by about 2.45.")
#create linear regression model and print summary
lm_model <- lm(BMI ~ FAF + SCC + CALC, data = final_data)
summary(lm_model)
##
## Call:
## lm(formula = BMI ~ FAF + SCC + CALC, data = final_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.0101 -5.5762 0.1485 5.8514 20.7423
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.6525 0.3640 81.454 < 2e-16 ***
## FAF -1.4222 0.1975 -7.200 8.32e-13 ***
## SCC -6.6746 0.8031 -8.311 < 2e-16 ***
## CALC 2.4446 0.3250 7.522 7.96e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.667 on 2107 degrees of freedom
## Multiple R-squared: 0.08551, Adjusted R-squared: 0.08421
## F-statistic: 65.68 on 3 and 2107 DF, p-value: < 2.2e-16
All three predictors have a significant impact on BMI: FAF (how often the person is physically active), SCC (whether the person monitors their daily calories), and CALC (alcohol consumption).
The R^2 value is 0.086, which is very low; this implies the model explains under 9% of the variance in BMI.
# --- Diagnostics ---
ggplot2::autoplot(lm_model)
Residual vs Fitted:
The points are relatively evenly spread around the horizontal line at 0, although there are more points above the line (especially near fitted values of ~30). This suggests the model captures the linear component of the relationship between BMI and FAF, SCC, and CALC reasonably well, and that the errors are mostly random.
There are no strong curved patterns, but the low R^2 suggests the relationship between BMI and FAF/SCC/CALC is not purely linear, so a linear regression model will not capture all of its complexity.
The spread of the residuals increases from left to right (as fitted values increase), which implies heteroscedasticity: the error term does not have constant variance.
Q-Q Plot:
The points follow the diagonal line only for a small portion of the graph, diverging at either end.
The deviation implies that the residuals are not normally distributed, and may be skewed as well.
# Extract residuals and fitted values with broom's augment() (broom was loaded above)
resid_data <- augment(lm_model)
# Create each residual plot with custom labels
# FAF (Physical Activity Frequency)
p1 <- ggplot(resid_data, aes(x = FAF, y = .resid)) +
geom_point(size = 1.5, alpha = 0.5) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
scale_x_continuous(breaks = 0:3,
labels = c("0" = "None", "1" = "Low", "2" = "Moderate", "3" = "High")) +
labs(title = "Residuals by Physical Activity Level (FAF)",
x = "Physical Activity Frequency", y = "Residuals") +
theme_minimal()
# SCC (Calories Monitoring)
p2 <- ggplot(resid_data, aes(x = SCC, y = .resid)) +
geom_point(size = 1.5, alpha = 0.5) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
scale_x_continuous(breaks = 0:1,
labels = c("0" = "No", "1" = "Yes")) +
labs(title = "Residuals by Calories Monitoring (SCC)",
x = "Monitors Calories?", y = "Residuals") +
theme_minimal()
# CALC (Alcohol Consumption)
p3 <- ggplot(resid_data, aes(x = CALC, y = .resid)) +
geom_point(size = 1.5, alpha = 0.5) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
scale_x_continuous(breaks = 0:3,
labels = c("0" = "Never", "1" = "Sometimes", "2" = "Frequently", "3" = "Always")) +
labs(title = "Residuals by Alcohol Consumption (CALC)",
x = "Alcohol Consumption", y = "Residuals") +
theme_minimal()
# Combine all 3 vertically
(p1 / p2 / p3)
Looking at the plot of residuals vs. FAF, the points are relatively evenly distributed around the red horizontal line (y = 0), implying the errors are random. There is no clear curved trend, so there is little evidence of a strongly nonlinear relationship between BMI and FAF.
Looking at the plot of residuals vs. SCC, both group medians are close to 0, which means there's no real bias towards either group in the model. However, the spread for 'no' is much larger than for 'yes', implying heteroscedasticity (violating an assumption of the linear model). There are also outliers for 'yes', which could mean the model explains 'no' instances better than 'yes' instances.
Finally, looking at the plot of residuals vs. CALC, all the medians are around 0, so nothing is biased, but the spread for 'no' and 'sometimes' is much larger than for 'always' and 'frequently'. While there may simply be more observations in those categories, it could also imply heteroscedasticity, again violating the assumptions of linear models.
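A formal test can back up the visual impression of heteroscedasticity; a minimal sketch, assuming the lmtest package is available (a small p-value indicates non-constant error variance):
# Breusch-Pagan test on the linear model's residuals
lmtest::bptest(lm_model)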
#residual histogram
gg_reshist(lm_model)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#cook's d
gg_cooksd(lm_model, threshold = 'matlab')
# Define the 5 most influential points
influential_points <- c(27, 69, 258, 381, 472)
# Subset the final data to show only the linear model predictors + BMI
final_data[influential_points, c("FAF", "SCC", "CALC", "BMI")]
# Create train/test from original data (not matrix)
train_data_lm <- final_data[split, ]
test_data_lm <- final_data[-split, ]
# Fit model
lm_model <- lm(BMI ~ FAF + SCC + CALC, data = train_data_lm)
# Predict
lr_preds_train <- predict(lm_model, newdata = train_data_lm)
lr_preds_test <- predict(lm_model, newdata = test_data_lm)
# Actuals
actual_train <- train_data_lm$BMI
actual_test <- test_data_lm$BMI
# Metrics (actual_train/actual_test are the BMI actuals for these rows)
lr_result <- append_model_result("Linear Regression", actual_train, lr_preds_train, actual_test, lr_preds_test)
# Add to final results
final_model_results <- bind_rows(final_model_results, lr_result)
lr_result
library(rpart)
library(rpart.plot)
# Train decision tree (depth limited to 3)
dt_model <- rpart(BMI ~ ., data = final_data[split, ], method = "anova",
control = rpart.control(maxdepth = 3))
# Visualize the tree
rpart.plot(dt_model, type = 2, extra = 101, fallen.leaves = TRUE,
box.palette = "Blues", main = "Decision Tree (Max Depth = 3)")
We started by training a basic decision tree to predict BMI from all other available variables in our dataset. To keep the model intentionally simple and interpretable, we restricted its complexity in two ways:
1.) Limited the maximum depth to 3 levels (preventing overcomplicated branches)
2.) Relied on rpart's default complexity parameter (cp = 0.01) to prune unnecessary splits
The 'anova' method specification tells R we're performing regression (predicting continuous BMI values) rather than classification.
For visualization, we created a clean, informative plot showing the hierarchical decision points (splits) based on important predictors, the predicted BMI value and observation counts at each terminal node (leaf), and a blue color gradient to help distinguish prediction ranges.
This restrained approach gives us a model that is: 1. easy to explain to non-technical stakeholders, 2. quick to train and evaluate, and 3. a baseline for comparing with more complex models later.
The visualization serves as both a diagnostic tool (checking whether splits make logical sense) and a communication aid (showing how different factors combine to influence BMI predictions). We can also verify the pruning directly, as sketched below.
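To verify the pruning described above, rpart's complexity table can be inspected directly (optional check; output not shown):
# Inspect the complexity-parameter table behind the pruning
printcp(dt_model)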
# Predict on train and test
dt_preds_train <- predict(dt_model, newdata = final_data[split, ])
dt_preds_test <- predict(dt_model, newdata = final_data[-split, ])
# Actual values
actual_train <- final_data$BMI[split]
actual_test <- final_data$BMI[-split]
# Calculate metrics
dt_result <- append_model_result("Decision Tree", actual_train, dt_preds_train, actual_test, dt_preds_test)
# Print summary
print(dt_result)
## # A tibble: 1 × 7
## Model RMSE_Train RMSE_Test MAE_Train MAE_Test R2_Train R2_Test
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Decision Tree 5.63 5.62 4.47 4.46 0.504 0.513
# Save for final comparison (append later)
final_model_results <- bind_rows(final_model_results, dt_result)
We used our simple decision tree to predict BMI in two situations:
1.) For people it already knew (training data)
2.) For new people it hadn't seen before (test data)
Then we calculated how far off our predictions were using RMSE (roughly, how many BMI units we're off on average), along with MAE and R².
First we made predictions: dt_preds_train holds predictions for the training group, and dt_preds_test for the held-out group.
Then we checked accuracy: we squared the differences between predictions and actual BMI values, took the average, and then the square root (that's RMSE). This gives one number showing the typical error size; a worked one-liner follows below.
Finally we printed clean results. The training error shows how well the model learned the patterns; the testing error shows how well it works on new people. A test RMSE of about 5.6 means we're typically about 5.6 BMI units off. Smaller numbers mean better predictions!
This helps us understand whether the model is working properly, whether it predicts better for known vs. new people, and how accurate we can expect it to be in real use.
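For reference, the test RMSE described above can be computed by hand, mirroring what append_model_result() does internally:
# Test RMSE for the decision tree, computed manually
sqrt(mean((actual_test - dt_preds_test)^2)) # ~5.6 BMI units, matching the table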
# Extract and format variable importance for Decision Tree
dt_importance <- tibble(
Feature = names(dt_model$variable.importance),
Importance = as.numeric(dt_model$variable.importance)
) %>%
arrange(desc(Importance))
# View top features (e.g., top 10)
print(head(dt_importance, 10))
## # A tibble: 10 × 2
## Feature Importance
## <chr> <dbl>
## 1 family_history_with_overweight 24987.
## 2 FCVC 13008.
## 3 CAEC 10969.
## 4 Age 8078.
## 5 MTRANS 3366.
## 6 TUE 2893.
## 7 Gender 1114.
## 8 CH2O 1112.
## 9 CALC 1062.
## 10 FAF 196.
Understanding What Really Affects BMI Predictions
After building our BMI-prediction model, we wanted to know which factors mattered most. Here's how we checked:
We extracted the "importance scores" from our decision tree model. These scores show how much each characteristic (like family history or age) helped make accurate predictions: the more a variable improved the tree's splits, the higher its score.
We organized this information neatly in a table with:
Column 1: the factor name (e.g., "family_history_with_overweight", "Age"). Column 2: its importance score (higher numbers = more important).
Rendering the table with kable() (sketched below) adds a clear title and makes it easy to compare which factors are most influential.
This helps us answer questions like:
• What lifestyle factors most affect BMI predictions? • Should we focus on collecting more detailed data about certain factors? • Do the important variables make logical sense?
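Rendering the importance scores with kable(), as described above (a presentation choice; the values are identical to the printed tibble):
# Display the top 10 importance scores as a formatted table
kable(head(dt_importance, 10),
      caption = "Decision Tree Variable Importance (Top 10)")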
############################
# XGBoost Tuning Functions
############################
tune_xgboost_param <- function(param_name, param_values, fixed_params) {
results <- tibble(
!!param_name := numeric(),
RMSE = numeric(),
MAE = numeric(),
R2 = numeric()
)
for (val in param_values) {
# Set up the dynamic parameter list
params <- modifyList(fixed_params, setNames(list(val), param_name))
# Extract all parameters with defaults for missing ones
max_depth <- params$max_depth %||% 6
eta <- params$eta %||% 0.3
nrounds <- params$nrounds %||% 100
subsample <- params$subsample %||% 1
colsample_bytree <- params$colsample_bytree %||% 1
min_child_weight <- params$min_child_weight %||% 1
# Train model with all parameters
model <- xgboost(
data = train_matrix,
label = train_label,
objective = "reg:squarederror",
max_depth = max_depth,
eta = eta,
nrounds = nrounds,
subsample = subsample,
colsample_bytree = colsample_bytree,
min_child_weight = min_child_weight,
verbose = 0
)
# Predict and evaluate
preds <- predict(model, newdata = test_matrix)
rmse <- sqrt(mean((preds - test_label)^2))
mae <- mean(abs(preds - test_label))
r2 <- 1 - sum((preds - test_label)^2) / sum((mean(test_label) - test_label)^2)
# Append to results
results <- add_row(results,
!!param_name := val,
RMSE = round(rmse, 3),
MAE = round(mae, 3),
R2 = round(r2, 3))
}
return(results)
}
# Helper for NULL handling (tidyverse's %||%); defined before any tuning calls run
"%||%" <- function(x, y) if (is.null(x)) y else x
# plot the results & and identify optimal level
plot_xgb_metric_curve <- function(results_df, hyperparam = "max_depth") {
# Find optimal row based on lowest RMSE
optimal_row <- results_df[which.min(results_df$RMSE), ]
optimal_value <- optimal_row[[hyperparam]]
# Reshape for plotting
results_long <- results_df %>%
pivot_longer(cols = c(RMSE, MAE), names_to = "Metric", values_to = "value")
# Plot
ggplot(results_long, aes(x = .data[[hyperparam]], y = value, color = Metric)) +
geom_line(linewidth = 1.2) +
geom_point() +
geom_vline(xintercept = optimal_value, linetype = "dashed", color = "red", linewidth = 1) +
annotate("text",
x = optimal_value + 0.015,
y = median(results_long$value) + .5, # More stable than max()
label = "Optimal",
color = "red",
angle = 90,
vjust = -0.2,
size = 4) +
theme_minimal() +
labs(
title = paste("Model Performance by", gsub("_", " ", tools::toTitleCase(hyperparam))),
x = tools::toTitleCase(gsub("_", " ", hyperparam)),
y = "Metric Value"
)
}
############################
# Depth Tuning (eta = 0.15, nrounds = 100)
############################
depth_results <- tune_xgboost_param("max_depth", seq(3, 12, by = 1),
fixed_params = list(eta = 0.15, nrounds = 100))
depth_plot <- plot_xgb_metric_curve(depth_results, hyperparam = "max_depth")
depth_results
depth_plot
######################################
# Eta Tuning (max_depth = 8, nrounds = 100)
######################################
eta_results <- tune_xgboost_param("eta",
seq(0.05, 0.3, by = 0.025),
fixed_params = list(max_depth = 8, nrounds = 100))
eta_plot <- plot_xgb_metric_curve(eta_results, hyperparam = "eta")
eta_results
eta_plot
##############################################
# nrounds Tuning (max_depth = 8, eta = 0.15)
##############################################
nrounds_results <- tune_xgboost_param(
"nrounds",
seq(20, 300, by = 20),
fixed_params = list(max_depth = 8, eta = 0.15)
)
nrounds_plot <- plot_xgb_metric_curve(nrounds_results, hyperparam = "nrounds")
nrounds_results
nrounds_plot
- We'll set nrounds to 80, the value where test RMSE reached its minimum.
############################
# Subsample Tuning (max_depth = 8, eta = 0.15, nrounds = 80)
############################
subsample_results <- tune_xgboost_param(
"subsample",
seq(0.5, 1.0, by = 0.05),
fixed_params = list(max_depth = 8, eta = 0.15, nrounds = 80)
)
subsample_plot <- plot_xgb_metric_curve(subsample_results, hyperparam = "subsample")
subsample_results
subsample_plot
############################
# Colsample_bytree Tuning (max_depth = 8, eta = 0.15, nrounds = 80, subsample = 0.7)
############################
# Use the optimal subsample value from previous tuning (pull a scalar, not a 1x1 tibble)
optimal_subsample <- subsample_results$subsample[which.min(subsample_results$RMSE)]
colsample_results <- tune_xgboost_param(
"colsample_bytree",
seq(0.5, 1, by = 0.05),
fixed_params = list(max_depth = 8, eta = 0.15, nrounds = 80, subsample = optimal_subsample)
)
colsample_plot <- plot_xgb_metric_curve(colsample_results, hyperparam = "colsample_bytree")
colsample_results
colsample_plot
############################
# Min_child_weight Tuning (using all optimal parameters from previous tuning)
############################
# Get the optimal colsample value as a scalar
optimal_colsample <- colsample_results$colsample_bytree[which.min(colsample_results$RMSE)]
min_child_results <- tune_xgboost_param(
"min_child_weight",
seq(1, 15, by = 1),
fixed_params = list(
max_depth = 8,
eta = 0.15,
nrounds = 80,
subsample = optimal_subsample,
colsample_bytree = optimal_colsample
)
)
min_child_plot <- plot_xgb_metric_curve(min_child_results, hyperparam = "min_child_weight")
min_child_results
min_child_plot
These last three hyperparameters showed a lot of variation in their results.
There is a massive list of hyperparameters one could tune, but I'm going to stop with these six.
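For a more robust search than a single train/test split, cross-validation could pick the round count automatically; a minimal sketch with xgb.cv (not run in the original analysis):
# 5-fold CV with early stopping to choose nrounds
cv <- xgb.cv(
  params = list(objective = "reg:squarederror", max_depth = 8, eta = 0.15,
                subsample = 0.7, colsample_bytree = 0.8, min_child_weight = 10),
  data = xgb.DMatrix(train_matrix, label = train_label),
  nrounds = 300, nfold = 5, early_stopping_rounds = 20, verbose = 0
)
cv$best_iteration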
############################
# Final Tuned XGBoost Model
############################
set.seed(4)
# Train final model with manually specified hyperparameters
final_model <- xgboost(
data = train_matrix,
label = train_label,
objective = "reg:squarederror",
booster = "gbtree",
max_depth = 8,
eta = 0.15,
nrounds = 80,
subsample = 0.7,
colsample_bytree = 0.8,
min_child_weight = 10,
verbose = 0
)
# Evaluate XGBoost on both train and test sets
xgb_preds_train <- predict(final_model, newdata = train_matrix)
xgb_preds_test <- predict(final_model, newdata = test_matrix)
# Calculate metrics
xgb_results <- append_model_result("XGBoost (Fully Tuned)", train_label, xgb_preds_train, test_label, xgb_preds_test)
# Print and store
print(xgb_results)
## # A tibble: 1 × 7
## Model RMSE_Train RMSE_Test MAE_Train MAE_Test R2_Train R2_Test
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 XGBoost (Fully Tuned) 1.55 2.78 1.03 1.95 0.963 0.881
final_model_results <- bind_rows(final_model_results, xgb_results)
The XGBoost model significantly outperforms the linear model:
Test RMSE dropped from 7.78 → 2.78
MAE was also very low at ~1.95, suggesting consistent predictions
# Feature importance using final_model (Based on gain)
importance_matrix <- xgb.importance(model = final_model, feature_names = colnames(train_matrix))
# View as table
print(importance_matrix)
## Feature Gain Cover Frequency
## 1: family_history_with_overweight 0.2317319372 0.036311285 0.027027027
## 2: Age 0.1478991272 0.217379167 0.187803188
## 3: FCVC 0.1161589515 0.120586196 0.112959113
## 4: CAEC 0.1123519260 0.077270439 0.042273042
## 5: TUE 0.0883558831 0.096562805 0.127165627
## 6: NCP 0.0810737564 0.083921494 0.085239085
## 7: FAF 0.0711779422 0.134785211 0.160429660
## 8: MTRANS 0.0463231355 0.035653456 0.025641026
## 9: Gender 0.0347175730 0.027308082 0.039501040
## 10: CH2O 0.0341372971 0.103555397 0.127512128
## 11: CALC 0.0245622328 0.027784749 0.035689536
## 12: FAVC 0.0066940218 0.015430032 0.018018018
## 13: SCC 0.0044059539 0.015985400 0.008662509
## 14: SMOKE 0.0004102622 0.007466288 0.002079002
# Optional: Plot top 15 features
xgb.plot.importance(
importance_matrix,
top_n = 15,
measure = "Gain",
rel_to_first = TRUE,
xlab = "Relative Importance"
)
# Step 1: Prepare DMatrix for SHAP
dtrain <- xgb.DMatrix(data = train_matrix, label = train_label)
# Step 2: Compute SHAP values
shap_values <- shap.values(xgb_model = final_model, X_train = train_matrix)
# Step 3: Create SHAP summary
shap_summary <- shap_values$mean_shap_score
shap_importance <- data.frame(
Feature = names(shap_summary),
Mean_SHAP = shap_summary
) |>
arrange(desc(Mean_SHAP))
# View top 10 SHAP features
head(shap_importance, 10)
Strong agreement across SHAP and Gain rankings suggests these top features are truly valuable (especially family history, FCVC, and Age).
SHAP provides extra context: it shows that even features like CH2O and Gender that aren’t frequently split on still influence predictions consistently.
Gain shows how much each split helps reduce error, so it’s more tree-structure specific.
SHAP captures the total effect on the model’s prediction output, even if the feature is only used in subtle ways.
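The per-observation SHAP values can also be visualized as a beeswarm summary; a short sketch using SHAPforxgboost's shap.prep()/shap.plot.summary() pair:
# SHAP summary plot: direction and spread of each feature's effect
shap_long <- shap.prep(xgb_model = final_model, X_train = train_matrix)
shap.plot.summary(shap_long)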
final_model_results %>%
arrange(RMSE_Test) %>%
kable(caption = "Model Comparison: Sorted by Test RMSE")
Model | RMSE_Train | RMSE_Test | MAE_Train | MAE_Test | R2_Train | R2_Test |
---|---|---|---|---|---|---|
XGBoost (Fully Tuned) | 1.546 | 2.775 | 1.034 | 1.952 | 0.963 | 0.881 |
Decision Tree | 5.634 | 5.615 | 4.469 | 4.461 | 0.504 | 0.513 |
Linear Regression | 7.631 | 7.777 | 6.390 | 6.416 | 0.090 | 0.065 |
# --- Step 1: Rank features for each model ---
# XGBoost top 10 features
xgb_top <- xgb.importance(model = final_model, feature_names = colnames(train_matrix)) %>%
slice_max(Gain, n = 10) %>%
mutate(Rank = row_number()) %>%
select(Rank, XGBoost = Feature)
# Decision Tree top 10 features (coerce to tibble explicitly)
dt_top <- tibble(
Feature = names(dt_model$variable.importance),
Importance = as.numeric(dt_model$variable.importance)
) %>%
arrange(desc(Importance)) %>%
dplyr::slice(1:10) %>%
mutate(Rank = row_number()) %>%
select(Rank, Decision_Tree = Feature)
# Linear Regression top 3 features based on t-stat
lm_summary <- summary(lm_model)
lr_top <- tibble(
Feature = rownames(lm_summary$coefficients),
T_stat = lm_summary$coefficients[, "t value"]
) %>%
filter(Feature != "(Intercept)") %>%
arrange(desc(abs(T_stat))) %>%
mutate(Rank = row_number()) %>%
dplyr::slice(1:3) %>%
select(Rank, Linear_Regression = Feature)
# --- Step 2: Merge all rankings into a single table ---
# Create a full Rank index (1 to 10)
rank_frame <- tibble(Rank = 1:10)
# Join all ranking tables
feature_rankings <- rank_frame %>%
left_join(xgb_top, by = "Rank") %>%
left_join(dt_top, by = "Rank") %>%
left_join(lr_top, by = "Rank")
# --- Step 3: Print clean table ---
library(knitr)
kable(feature_rankings, caption = "Top Feature Rankings Across Models")
Rank | XGBoost | Decision_Tree | Linear_Regression |
---|---|---|---|
1 | family_history_with_overweight | family_history_with_overweight | SCC |
2 | Age | FCVC | CALC |
3 | FCVC | CAEC | FAF |
4 | CAEC | Age | NA |
5 | TUE | MTRANS | NA |
6 | NCP | TUE | NA |
7 | FAF | Gender | NA |
8 | MTRANS | CH2O | NA |
9 | Gender | CALC | NA |
10 | CH2O | FAF | NA |
Conclusion:
Methods: we built a shallow decision tree (max depth = 3), chosen to balance interpretability against predictive power, alongside a linear regression baseline and a tuned XGBoost model. Results: training and testing RMSE are reported above, together with the decision tree plot (Figure 1) and variable importance (Table 1). Discussion: decision trees are simple to interpret but can underfit complex patterns.
1. Linear Regression
Pros:
Interpretability: Linear regression is easily interpretable. The coefficients directly show the relationship between each predictor and the outcome variable
Simplicity: It’s a simple model to understand/implement
Cons:
Not for prediction: here the R^2 of roughly 0.09 indicates very low predictive power (the model explains less than 10% of the variance in BMI)
Violated Linear Assumptions: Linear regression relies on several assumptions (linearity, independence, homoscedasticity, normality of errors), which were violated based on my diagnostic plots
Not Complex: It can only model linear relationships, and this data is not linearly related, so it misses complexities in the data
2. XGBoost
Pros:
Good for predicting: XGBoost outperformed linear regression with a lower RMSE and MAE. It better captures complex relationships and makes good predictions
Handles Nonlinear relations: XGBoost can handle/capture the nonlinear relationship that is present between variables
Cons:
Interpretability: XGBoost models are harder to interpret than linear regression
Complexity: XGBoost has many hyperparameters that require tuning
3. Decision Tree
Pros:
Interpretability: Decision trees are relatively easy to interpret, especially shallow trees like the one we used
Visualization: Decision trees are easily visualized so we can see how the model works
Cons:
Not great at predicting: Shallow decision trees don’t predict as well as XGBoost
Potential Underfitting: Shallow trees have the potential to underfit the data, so it doesn’t get all the patterns in the data
For our case:
The linear regression model is probably inadequate: the data is nonlinear, so the model can't accurately capture the relationship, and the assumptions of linear models are violated, so the model is not 'good'
The XGBoost model is probably the best: it has the highest predictive power and the best model results, although it's more difficult to interpret than linear regression or a decision tree