Load/Clean Data (Carson)

# Load libraries

library(readr) 
library(dplyr) 
library(ggthemes)
library(ggrepel)
library(lindia)
library(ggcorrplot)
library(xgboost)
library(tibble)
library(caret)
library(SHAPforxgboost)
library(data.table)
library(rpart)
library(rpart.plot)
library(knitr)
# Load dataset
obesity <- read_csv("ObesityDataSet_raw_and_data_sinthetic.csv")
## Rows: 2111 Columns: 17
## Delimiter: ","
## chr (9): Gender, family_history_with_overweight, FAVC, CAEC, SMOKE, SCC, CAL...
## dbl (8): Age, Height, Weight, FCVC, NCP, CH2O, FAF, TUE
obesity <- obesity %>%
  mutate(BMI = Weight / (Height ^ 2))

# Remove columns that directly determine BMI (Height, Weight) and the obesity label (NObeyesdad)
obesity_clean <- obesity %>%
  select(-Height, -Weight, -NObeyesdad)

# Convert categorical variables to numeric (with ordering where appropriate)
obesity_clean <- obesity_clean %>%
  mutate(
    # Binary factors to 0/1
    Gender = if_else(Gender == "Female", 0, 1),
    family_history_with_overweight = if_else(family_history_with_overweight == "no", 0, 1),
    FAVC = if_else(FAVC == "no", 0, 1),
    CAEC = case_when(
      CAEC == "no" ~ 0,
      CAEC == "Sometimes" ~ 1,
      CAEC == "Frequently" ~ 2,
      CAEC == "Always" ~ 3
    ),
    SMOKE = if_else(SMOKE == "no", 0, 1),
    SCC = if_else(SCC == "no", 0, 1),
    CALC = case_when(
      CALC == "no" ~ 0,
      CALC == "Sometimes" ~ 1,
      CALC == "Frequently" ~ 2,
      CALC == "Always" ~ 3
    ),
    MTRANS = case_when(
      MTRANS == "Walking" ~ 0,
      MTRANS == "Bike" ~ 1,
      MTRANS == "Public_Transportation" ~ 2,
      MTRANS == "Motorbike" ~ 3,
      MTRANS == "Automobile" ~ 4
    )
  )
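Since case_when() returns NA for any level it does not match, a quick check that the recoding introduced no missing values is cheap insurance; a minimal sketch:

# Confirm no NAs were introduced by the recoding above
# (case_when() yields NA for unmatched levels)
colSums(is.na(obesity_clean))

# Spot-check one ordinal encoding: CAEC should contain only values 0-3
table(obesity_clean$CAEC)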

Model 1: Linear Regression (Kylie)

# Create the linear regression model and print its summary

lrmodel <- lm(BMI ~ FAF + SCC + CALC, data = obesity)

summary(lrmodel)
## 
## Call:
## lm(formula = BMI ~ FAF + SCC + CALC, data = obesity)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.4201  -5.5022   0.2173   5.7866  20.0852 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     23.7806     7.5927   3.132  0.00176 ** 
## FAF             -1.2893     0.1966  -6.558 6.83e-11 ***
## SCCyes          -6.3778     0.7966  -8.007 1.93e-15 ***
## CALCFrequently   5.5560     7.6449   0.727  0.46745    
## CALCno           5.1129     7.5963   0.673  0.50097    
## CALCSometimes    8.7095     7.5930   1.147  0.25149    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.59 on 2105 degrees of freedom
## Multiple R-squared:  0.1045, Adjusted R-squared:  0.1024 
## F-statistic: 49.13 on 5 and 2105 DF,  p-value: < 2.2e-16
# Create diagnostic plots for the model

# Residuals vs. fitted values

gg_resfitted(lrmodel) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Residuals vs. each predictor

plots <- gg_resX(lrmodel, plot.all = TRUE)

# Residual histogram

gg_reshist(lrmodel)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Q-Q plot

gg_qqplot(lrmodel)

# Cook's distance

gg_cooksd(lrmodel, threshold = "matlab")

# Make predictions on the full dataset (the model was fit on all rows, so this is in-sample)
lr_preds <- predict(lrmodel, newdata = obesity)

# Calculate MSE
lr_mse <- mean((lr_preds - obesity$BMI) ^ 2)
cat("Linear Regression Performance:\n")
## Linear Regression Performance:
cat("Mean Squared Error (MSE):", round(lr_mse, 2), "\n")
## Mean Squared Error (MSE): 57.45
# Calculate R-squared
lr_r_squared <- summary(lrmodel)$r.squared
cat("R-squared:", round(lr_r_squared, 3), "\n\n")
## R-squared: 0.105
# Coefficients
cat("Linear Regression Coefficients:\n")
## Linear Regression Coefficients:
coefficients <- coef(lrmodel)
print(coefficients)
##    (Intercept)            FAF         SCCyes CALCFrequently         CALCno 
##      23.780631      -1.289282      -6.377826       5.556040       5.112922 
##  CALCSometimes 
##       8.709504
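One caveat on the numbers above: the MSE is computed on the same rows the model was trained on, so it measures in-sample fit rather than generalization. A minimal sketch of a held-out version, mirroring the 80/20 caret split used by the later models (reusing seed 123 here is an assumption, made for comparability):

# Refit the linear model on 80% of the data and score the held-out 20%
set.seed(123)
lr_idx <- createDataPartition(obesity$BMI, p = 0.8, list = FALSE)
lr_fit <- lm(BMI ~ FAF + SCC + CALC, data = obesity[lr_idx, ])

# Caveat: CALC has a very rare "Always" level; an unlucky split could leave it
# test-only and trip predict() on a new factor level (assumed not to happen here)
lr_test_preds <- predict(lr_fit, newdata = obesity[-lr_idx, ])

# Out-of-sample MSE, comparable to the in-sample figure above
mean((lr_test_preds - obesity$BMI[-lr_idx])^2)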

Model 2: XGBoost (Carson)

# Create numeric correlation matrix from cleaned dataset
cor_matrix <- obesity_clean %>%
  select(where(is.numeric)) %>%
  cor()

# Plot the correlation matrix visually
ggcorrplot(cor_matrix,
           hc.order = TRUE,              # Cluster similar variables
           type = "upper",              # Show upper triangle only
           lab = TRUE,                  # Show correlation values inside squares
           lab_size = 3,
           method = "square",           # Use colored squares
           colors = c("blue", "white", "red"),  # Blue = negative, red = positive
           title = "Correlation Matrix",
           ggtheme = theme_minimal())
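Since BMI is the response we ultimately model, pulling its row out of the matrix gives a quick ranked view of which predictors co-move with it; a short sketch:

# Correlations of every variable with BMI, strongest positive first
sort(cor_matrix["BMI", ], decreasing = TRUE)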

# Prepare full matrix and labels again
obesity_model_data <- obesity |>
  mutate(BMI = Weight / (Height^2)) |>
  select(-Weight, -Height, -NObeyesdad)

full_matrix <- model.matrix(BMI ~ . -1, data = obesity_model_data)
full_label <- obesity_model_data$BMI

# Train/test split
set.seed(123)
split <- createDataPartition(full_label, p = 0.8, list = FALSE)
train_matrix <- full_matrix[split, ]
test_matrix <- full_matrix[-split, ]
train_label <- full_label[split]
test_label <- full_label[-split]

# Hyperparameter grid to test
param_grid <- tribble(
  ~max_depth, ~eta, ~nrounds,
  3,  0.1, 100,
  4,  0.05, 150,
  5,  0.2, 75,
  6,  0.1, 100,
  3,  0.3, 50,
  5,  0.05, 200,
  4,  0.2, 80,
  6,  0.3, 60,
  2,  0.15, 120,
  7,  0.1, 100
)

# Initialize an empty results tibble with named columns
results <- tibble(
  max_depth = numeric(),
  eta = numeric(),
  nrounds = numeric(),
  RMSE = numeric(),
  MAE = numeric()
)


# Loop through hyperparameter combos
for (i in 1:nrow(param_grid)) {
  params <- param_grid[i, ]
  
  model <- xgboost(data = train_matrix,
                   label = train_label,
                   objective = "reg:squarederror",
                   max_depth = params$max_depth,
                   eta = params$eta,
                   nrounds = params$nrounds,
                   verbose = 0)
  
  preds <- predict(model, newdata = test_matrix)
  rmse <- sqrt(mean((preds - test_label)^2))
  mae <- mean(abs(preds - test_label))
  
  results <- results |> 
    add_row(
      max_depth = params$max_depth,
      eta = params$eta,
      nrounds = params$nrounds,
      RMSE = round(rmse, 3),
      MAE = round(mae, 3)
    )
}

# View the results
print(results)
## # A tibble: 10 × 5
##    max_depth   eta nrounds  RMSE   MAE
##        <dbl> <dbl>   <dbl> <dbl> <dbl>
##  1         3  0.1      100  3.74  2.89
##  2         4  0.05     150  3.53  2.64
##  3         5  0.2       75  3.05  2.11
##  4         6  0.1      100  2.98  2.01
##  5         3  0.3       50  3.59  2.76
##  6         5  0.05     200  3.17  2.27
##  7         4  0.2       80  3.27  2.34
##  8         6  0.3       60  3.20  2.12
##  9         2  0.15     120  4.10  3.19
## 10         7  0.1      100  2.86  1.90
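Rather than reading the winner off the table by eye, the best row can be selected programmatically; a small sketch using dplyr (slice_min() avoids slice(), which xgboost masks):

# Keep the hyperparameter combination with the lowest test RMSE
best_params <- results |>
  slice_min(RMSE, n = 1)
print(best_params)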
# Step 2: Create the design matrix BEFORE splitting (ensures identical columns in both splits).
# Note: unlike the grid search above, this matrix is built from the raw obesity data, so
# Weight, Height, and NObeyesdad remain as predictors; because BMI is computed directly
# from Weight and Height, the near-perfect scores below largely reflect that leakage.
full_matrix <- model.matrix(BMI ~ . -1, data = obesity)
full_label <- obesity$BMI

# Step 3: Split indices
set.seed(123)
split <- createDataPartition(full_label, p = 0.8, list = FALSE)

train_matrix <- full_matrix[split, ]
test_matrix <- full_matrix[-split, ]
train_label <- full_label[split]
test_label <- full_label[-split]

# Step 4: Train XGBoost
xgb_model <- xgboost(data = train_matrix,
                     label = train_label,
                     objective = "reg:squarederror",
                     nrounds = 100,
                     eta = 0.1,
                     max_depth = 4,
                     verbose = 0)

# Step 5: Predict and evaluate
preds <- predict(xgb_model, newdata = test_matrix)
rmse <- sqrt(mean((preds - test_label)^2))
mae <- mean(abs(preds - test_label))

cat("XGBoost Model Performance:\n")
## XGBoost Model Performance:
cat("RMSE:", round(rmse, 2), "\n")
## RMSE: 0.46
cat("MAE:", round(mae, 2), "\n")
## MAE: 0.35
# Get feature importance
importance_matrix <- xgb.importance(model = xgb_model, feature_names = colnames(train_matrix))

# View as a table
print(importance_matrix)
##                               Feature         Gain        Cover    Frequency
##                                <char>        <num>        <num>        <num>
##  1:                            Weight 8.723899e-01 4.507027e-01 0.3397982933
##  2:        NObeyesdadObesity_Type_III 6.171327e-02 4.628121e-02 0.0271528317
##  3:          NObeyesdadObesity_Type_I 2.845493e-02 2.598048e-02 0.0240496509
##  4:                            Height 1.680086e-02 2.598545e-01 0.2979053530
##  5:           NObeyesdadNormal_Weight 8.824435e-03 1.677272e-02 0.0193948798
##  6:         NObeyesdadObesity_Type_II 5.647037e-03 2.227027e-02 0.0217222653
##  7:     NObeyesdadOverweight_Level_II 4.283408e-03 8.278699e-03 0.0108611327
##  8:                               FAF 4.804601e-04 1.759939e-02 0.0302560124
##  9:                               Age 3.334638e-04 3.541109e-02 0.0853374709
## 10:      NObeyesdadOverweight_Level_I 3.238365e-04 2.406213e-03 0.0046547711
## 11:                              CH2O 3.143823e-04 3.996452e-02 0.0387897595
## 12:                               TUE 1.112437e-04 1.774394e-02 0.0333591932
## 13:                      GenderFemale 1.082058e-04 2.561157e-02 0.0116369279
## 14:                              FCVC 7.889738e-05 9.043628e-03 0.0209464701
## 15:                               NCP 4.238821e-05 1.109298e-02 0.0131885182
## 16:                            CALCno 3.334350e-05 1.573525e-03 0.0046547711
## 17:       MTRANSPublic_Transportation 1.928270e-05 2.120118e-03 0.0015515904
## 18:                     CAECSometimes 1.528758e-05 5.167787e-03 0.0054305663
## 19:                     CALCSometimes 8.003848e-06 4.472124e-04 0.0007757952
## 20: family_history_with_overweightyes 6.081511e-06 6.489850e-04 0.0023273856
## 21:                    CAECFrequently 5.405025e-06 9.215285e-04 0.0031031808
## 22:                           FAVCyes 4.800217e-06 7.829981e-05 0.0023273856
## 23:                     MTRANSWalking 1.078721e-06 2.860955e-05 0.0007757952
# Optional: Plot top features
xgb.plot.importance(importance_matrix, top_n = 15, 
                    measure = "Gain", 
                    rel_to_first = TRUE, 
                    xlab = "Relative Importance")

# Step 1: (Optional) DMatrix form of the training data; shap.values() below
# accepts the plain matrix directly, so dtrain is not strictly needed here
dtrain <- xgb.DMatrix(data = train_matrix, label = train_label)

# Step 2: Compute SHAP values
shap_values <- shap.values(xgb_model = xgb_model, X_train = train_matrix)

# Step 3: Get SHAP importance summary
shap_summary <- shap_values$mean_shap_score
shap_importance <- data.frame(
  Feature = names(shap_summary),
  Mean_SHAP = shap_summary
) |>
  arrange(desc(Mean_SHAP))

# View top features by SHAP
head(shap_importance, 10)
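Mean |SHAP| scores rank features but hide direction; SHAPforxgboost also supports a beeswarm-style summary plot that shows whether high feature values push BMI up or down. A short sketch:

# Long-format SHAP data (one row per observation-feature pair)
shap_long <- shap.prep(xgb_model = xgb_model, X_train = train_matrix)

# Beeswarm summary: importance ranking plus direction of each feature's effect
shap.plot.summary(shap_long)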

Model 3: Decision Tree (Uma)

# Convert character variables to factors
obesity_data <- obesity %>% 
  mutate(across(where(is.character), as.factor))

# Remove any missing values
obesity_data <- na.omit(obesity_data)

# Split the data into training and testing sets (80/20 split)
set.seed(123)
train_index <- createDataPartition(obesity_data$Weight, p = 0.8, list = FALSE)
train_data <- obesity_data[train_index, ]
test_data <- obesity_data[-train_index, ]
# Train a shallow decision tree (maximum depth = 3)
tree_model <- rpart(Weight ~ ., 
                    data = train_data, 
                    method = "anova", 
                    control = rpart.control(maxdepth = 3, cp = 0.01))

# Plot the decision tree
rpart.plot(tree_model,
           type = 4,
           extra = 101,
           fallen.leaves = TRUE,
           box.palette = "Blues",
           shadow.col = "gray")

We started by training a basic decision tree to predict weight from all other available variables in our dataset. To keep the model intentionally simple and interpretable, we restricted the tree's complexity in two ways:

1. Limited the maximum depth to 3 levels, preventing overcomplicated branches
2. Kept the default complexity parameter (cp = 0.01), which prunes splits that barely improve the fit (a sketch of the resulting complexity table follows this discussion)

The 'anova' method tells R we are performing regression (predicting continuous weight values) rather than classification.

For visualization, we created a clean, informative plot showing:

1. The hierarchical decision points (splits) based on important predictors
2. The predicted weight values at each terminal node (leaf)
3. A color gradient (a blue palette) to help distinguish between prediction ranges
4. Subtle drop shadows to improve readability of the tree structure

This restrained approach gives us a model that:

1. Is easy to explain to non-technical stakeholders
2. Is quick to train and evaluate
3. Provides a baseline for comparison with more complex models later

The visualization serves as both a diagnostic tool (checking whether the splits make logical sense) and a communication aid (showing how different factors combine to influence weight predictions).
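For readers who want to see what the pruning is doing, rpart exposes its complexity table; a quick sketch:

# Cross-validated error at each candidate complexity (pruning) level
printcp(tree_model)

# The same table as a plot, useful for choosing cp visually
plotcp(tree_model)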

# Predict on training and test data
train_pred <- predict(tree_model, newdata = train_data)
test_pred <- predict(tree_model, newdata = test_data)

# Calculate RMSE (Root Mean Squared Error)
train_rmse <- sqrt(mean((train_pred - train_data$Weight)^2))
test_rmse <- sqrt(mean((test_pred - test_data$Weight)^2))

# Print RMSE neatly
cat("Training RMSE:", round(train_rmse, 2), "\n")
## Training RMSE: 6.86
cat("Testing RMSE:", round(test_rmse, 2), "\n")
## Testing RMSE: 7.54

We used our simple decision tree to predict weights in two situations:

1. For people it already knew (training data)
2. For new people it hadn't seen before (test data)

Then we calculated how far off our predictions were using RMSE (roughly, how many kilograms we are off on average).

First we made predictions:

• train_pred = predictions for our original group
• test_pred = predictions for the new group

Then we checked accuracy:

• Squared all the differences between predictions and real weights
• Took the average, then the square root (that's RMSE)
• This gives one number showing the typical error size (a small reusable helper for this computation is sketched below)

Finally we printed clean results:

• Training error: shows how well the model learned the patterns
• Testing error: shows how well it works on new people

The printout above shows a testing RMSE of about 7.5, meaning we are typically about 7.5 kilograms off when predicting weight for new people. Smaller numbers mean better predictions!

This helps us understand:

• Whether our model is working properly
• Whether it predicts better for known vs. new people
• How accurate we can expect it to be in real use
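The same squared-error arithmetic, written once as a helper, keeps the two calls above symmetric; a minimal sketch:

# RMSE: square the errors, average them, then take the square root
rmse <- function(predicted, actual) {
  sqrt(mean((predicted - actual)^2))
}

rmse(train_pred, train_data$Weight)  # training error
rmse(test_pred, test_data$Weight)    # testing error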

# Check variable importance
importance <- data.frame(
  Variable = names(tree_model$variable.importance),
  Importance = as.numeric(tree_model$variable.importance)
)

# Display neatly
kable(importance, caption = "Variable Importance in Decision Tree Model")
Table: Variable Importance in Decision Tree Model

|Variable                       |   Importance|
|:------------------------------|------------:|
|BMI                            | 1030235.6099|
|NObeyesdad                     |  944994.3397|
|Age                            |  276867.2133|
|family_history_with_overweight |  167752.3324|
|CH2O                           |  127656.6369|
|CAEC                           |  127294.8459|
|Height                         |   69168.5791|
|FCVC                           |   48378.3335|
|NCP                            |   45096.6393|
|CALC                           |   30338.9549|
|Gender                         |   21633.7995|
|FAF                            |   20638.6060|
|TUE                            |    9871.7337|
|SCC                            |     877.1694|
|MTRANS                         |     774.6290|

Understanding What Really Affects Weight Predictions

After building our weight-prediction model, we wanted to know which factors mattered most. Here's how we checked:

• We extracted the "importance scores" from our decision tree model
• These scores show how much each characteristic (like age or height) helped make accurate predictions
• The more a variable was used to split the data, the higher its score

We organized this information in a table with:

• Column 1: the factor name (e.g., "Height", "Age")
• Column 2: its importance score (higher numbers = more important)

We used the kable() function to display this as a clean, professional-looking table, with a clear title explaining what it shows, making it easy to compare which factors are most influential. A bar-chart version of the same scores is sketched below.

One caveat when reading Table 1: BMI is computed directly from Weight and Height, and NObeyesdad is the dataset's obesity-level label, so their dominance at the top of the table is expected rather than revealing about lifestyle factors.

This helps us answer questions like:

• What lifestyle factors most affect weight predictions?
• Should we focus on collecting more detailed data about certain factors?
• Do the important variables make logical sense?
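As a visual companion to Table 1, the same scores can be drawn as a bar chart (a sketch; ggplot2 is already attached via the plotting packages loaded above):

# Bar chart of the decision tree's variable importance, largest on top
ggplot(importance, aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(x = NULL, y = "Importance score",
       title = "Variable Importance in Decision Tree Model") +
  theme_minimal()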

Conclusion:

Methods - We built a shallow decision tree (maximum depth = 3), chosen to balance interpretability against predictive power.
Results - We report the training RMSE and testing RMSE, include the decision tree plot (Figure 1), and summarize variable importance (Table 1).
Discussion - Decision trees are simple to interpret but can underfit complex patterns.

Comparison

1. Linear Regression

2. XGBoost

3. Decision Tree

For our case: