#========================================================= #COMPARE MACHINE LEARNING PIPELINES #Outcome: Average systolic blood pressure #input var: #avg_systolic, # Outcome variable: average systolic BP #chocolate, # Chocolate intake group: 0 = No, 1 = Yes #choc_grams, # Chocolate/cocoa grams #RIDAGEYR, # Age #RIAGENDR, # Gender #DR1TKCAL, # Total calories #DR1TSUGR, # Total sugar #DR1TTFAT, # Total fat #DR1TPROT, # Total protein #DR1TCARB # Total carbohydrates #output var: #Models: #1. Linear Regression #2. Random Forest Regression #3. Gradient Boosting Regression #=========================================================

library(httr)
library(haven)
## Warning: package 'haven' was built under R version 4.5.3
url <- "https://wwwn.cdc.gov/Nchs/Nhanes/2017-20121/DEMO_J.XPT"
dest <- file.path(tempdir(), "DEMO_J.XPT")

res <- GET(
  url,
  user_agent("R"),
  write_disk(dest, overwrite = TRUE)
)
print(file.info(dest)$size)
## [1] 20905

#LOAD REQUIRED LIBRARIES

library(dplyr)      # Used for data cleaning, grouping, and summarizing
## Warning: package 'dplyr' was built under R version 4.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)    # Used for creating visualizations
## Warning: package 'ggplot2' was built under R version 4.5.3
library(psych)      # Used for descriptive statistics
## Warning: package 'psych' was built under R version 4.5.3
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(broom)      # Used to convert model results into clean tables
## Warning: package 'broom' was built under R version 4.5.3
library(nhanesA)      # Download NHANES datasets directly in R
## Warning: package 'nhanesA' was built under R version 4.5.3
library(dplyr)        # Data manipulation
library(caret)        # Train/test split + preprocessing + ML utilities
## Warning: package 'caret' was built under R version 4.5.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:httr':
## 
##     progress
library(randomForest) # Random Forest model
## Warning: package 'randomForest' was built under R version 4.5.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
## 
##     outlier
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(pROC)         # ROC and AUC evaluation
## Warning: package 'pROC' was built under R version 4.5.3
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

#2. DOWNLOAD NHANES DATA (REAL DATA) Datasets

DEMO_J   <- nhanes("DEMO_J")    # Demographic data
BPX_J    <- nhanes("BPX_J")     # Blood pressure examination data
DR1TOT_J <- nhanes("DR1TOT_J")  # Day 1 total nutrient intake
DR1IFF_J <- nhanes("DR1IFF_J")  # Day 1 individual foods file

dim(DR1IFF_J)
## [1] 112683     84

#CREATE CHOCOLATE INDICATOR

chocolate_codes <- c(
  53111000, 53113000, 53114000, 53201000, 53202000,
  53203000, 53233060, 54101000, 54102000, 92530010
)

# Filter food-level data to rows representing chocolate-related foods
chocolate_data <- DR1IFF_J %>%
  filter(DR1IFDCD %in% chocolate_codes) %>%   # Keep only chocolate-coded foods
  group_by(SEQN) %>%                          # Group by participant ID
  summarise(choc_grams = sum(DR1IGRMS, na.rm = TRUE), .groups = "drop") %>% 
  mutate(chocolate = 1)                       # Mark participant as chocolate consumer

#MERGE NHANES TABLES INTO ONE ANALYSIS DATASET

final_data <- DEMO_J %>%
  inner_join(BPX_J, by = "SEQN") %>%          # Merge demographics + blood pressure
  inner_join(DR1TOT_J, by = "SEQN") %>%       # Merge total nutrition intake
  left_join(chocolate_data, by = "SEQN")      # Add chocolate indicator and grams

#REPLACE MISSING CHOCOLATE VALUES

final_data <- final_data %>%
  mutate(
    chocolate  = ifelse(is.na(chocolate), 0, chocolate),    # 0 = no chocolate food found
    choc_grams = ifelse(is.na(choc_grams), 0, choc_grams)   # 0 grams if no chocolate intake
  )

#CREATE AVERAGE SYSTOLIC BLOOD PRESSURE

final_data <- final_data %>%
  mutate(
    BPXSY1 = as.numeric(BPXSY1),   # First systolic reading
    BPXSY2 = as.numeric(BPXSY2),   # Second systolic reading
    BPXSY3 = as.numeric(BPXSY3),   # Third systolic reading
    BPXSY4 = as.numeric(BPXSY4)    # Fourth systolic reading
  ) %>%
  mutate(
    avg_systolic = rowMeans(
      select(., BPXSY1, BPXSY2, BPXSY3, BPXSY4), 
      na.rm = TRUE
    )
  ) %>%
  mutate(
    avg_systolic = ifelse(is.nan(avg_systolic), NA, avg_systolic) 
    # If all BP readings were missing, rowMeans gives NaN, so convert to NA
  )

#PREPARE REGRESSION DATASET

regression_data <- final_data %>%
  select(
    avg_systolic,   # Outcome variable: average systolic BP
    chocolate,      # Chocolate intake group: 0 = No, 1 = Yes
    choc_grams,     # Chocolate/cocoa grams
    RIDAGEYR,       # Age
    RIAGENDR,       # Gender
    DR1TKCAL,       # Total calories
    DR1TSUGR,       # Total sugar
    DR1TTFAT,       # Total fat
    DR1TPROT,       # Total protein
    DR1TCARB        # Total carbohydrates
  ) %>%
  filter(
    !is.na(avg_systolic)   # Remove rows without systolic BP
  ) %>%
  mutate(
    chocolate = as.factor(chocolate),     # Convert chocolate group to factor
    RIAGENDR = as.factor(RIAGENDR),       # Convert gender to factor
    choc_grams = as.numeric(choc_grams),  # Ensure chocolate grams is numeric
    RIDAGEYR = as.numeric(RIDAGEYR),      # Ensure age is numeric
    DR1TKCAL = as.numeric(DR1TKCAL),      # Ensure calories are numeric
    DR1TSUGR = as.numeric(DR1TSUGR),      # Ensure sugar is numeric
    DR1TTFAT = as.numeric(DR1TTFAT),      # Ensure fat is numeric
    DR1TPROT = as.numeric(DR1TPROT),      # Ensure protein is numeric
    DR1TCARB = as.numeric(DR1TCARB)       # Ensure carbohydrates are numeric
  ) %>%
  na.omit()   # Remove remaining missing values


# Check final sample size
dim(regression_data)
## [1] 6125   10

#TRAIN / TEST SPLIT

set.seed(123)  # Reproducibility

train_index <- createDataPartition(
  regression_data$avg_systolic,  # Outcome variable
  p = 0.80,                      # 80% training, 20% testing
  list = FALSE
)

train_data <- regression_data[train_index, ]   # Training data
test_data  <- regression_data[-train_index, ]  # Testing data

# Confirm sample size
nrow(train_data)
## [1] 4902
nrow(test_data)
## [1] 1223

#Build Model 1: Linear Regression

lm_model <- lm(
  avg_systolic ~ chocolate + choc_grams + RIDAGEYR + RIAGENDR +
    DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB,
  data = train_data
)

# View coefficients, p-values, and R-squared
summary(lm_model)
## 
## Call:
## lm(formula = avg_systolic ~ chocolate + choc_grams + RIDAGEYR + 
##     RIAGENDR + DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB, 
##     data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.801 -10.019  -1.421   8.012 105.319 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    99.772446   0.807594 123.543  < 2e-16 ***
## chocolate1      3.962369   5.126977   0.773 0.439650    
## choc_grams     -0.047680   0.078513  -0.607 0.543687    
## RIDAGEYR        0.543021   0.010124  53.639  < 2e-16 ***
## RIAGENDRFemale -1.679276   0.473948  -3.543 0.000399 ***
## DR1TKCAL        0.006243   0.001391   4.489 7.33e-06 ***
## DR1TSUGR       -0.002533   0.005791  -0.437 0.661882    
## DR1TTFAT       -0.050211   0.014317  -3.507 0.000457 ***
## DR1TPROT       -0.034535   0.010566  -3.269 0.001089 ** 
## DR1TCARB       -0.022575   0.006869  -3.286 0.001022 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.76 on 4892 degrees of freedom
## Multiple R-squared:  0.3783, Adjusted R-squared:  0.3772 
## F-statistic: 330.8 on 9 and 4892 DF,  p-value: < 2.2e-16
# Predict on test data
lm_pred <- predict(
  lm_model,
  newdata = test_data
)

#Build Model 2: Random Forest Regression

set.seed(123)

rf_model <- randomForest(
  avg_systolic ~ chocolate + choc_grams + RIDAGEYR + RIAGENDR +
    DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB,
  data = train_data,
  ntree = 500,        # Number of trees
  importance = TRUE   # Calculate variable importance
)

# View Random Forest summary
print(rf_model)
## 
## Call:
##  randomForest(formula = avg_systolic ~ chocolate + choc_grams +      RIDAGEYR + RIAGENDR + DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT +      DR1TCARB, data = train_data, ntree = 500, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 255.0425
##                     % Var explained: 36.07
# Predict on test data
rf_pred <- predict(
  rf_model,
  newdata = test_data
)

# Variable importance
importance(rf_model)
##              %IncMSE IncNodePurity
## chocolate   -1.08349      2904.220
## choc_grams   1.19876      5654.535
## RIDAGEYR   221.99465    826238.080
## RIAGENDR    18.57220     31681.175
## DR1TKCAL    30.09189    179466.588
## DR1TSUGR    24.72750    194394.499
## DR1TTFAT    27.48672    179100.914
## DR1TPROT    25.53958    194466.940
## DR1TCARB    26.14524    181115.260
# Plot variable importance
varImpPlot(
  rf_model,
  main = "Random Forest Variable Importance"
)

#Build Model 3: Gradient Boosting Regression

# install.packages("gbm")
library(gbm) 
## Warning: package 'gbm' was built under R version 4.5.3
## Loaded gbm 2.2.3
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
set.seed(123)

gbm_model <- gbm(
  avg_systolic ~ chocolate + choc_grams + RIDAGEYR + RIAGENDR +
    DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB,
  data = train_data,
  distribution = "gaussian",   # Regression outcome
  n.trees = 1000,              # Number of boosting trees
  interaction.depth = 3,       # Tree depth
  shrinkage = 0.01,            # Learning rate
  n.minobsinnode = 10,         # Minimum observations per terminal node
  cv.folds = 5,                # Cross-validation
  verbose = FALSE
)

# Find best number of trees
best_trees <- gbm.perf(
  gbm_model,
  method = "cv"
)

# Predict on test data
gbm_pred <- predict(
  gbm_model,
  newdata = test_data,
  n.trees = best_trees
)

# Variable importance
summary(
  gbm_model,
  n.trees = best_trees
)

##                   var      rel.inf
## RIDAGEYR     RIDAGEYR 90.865076619
## RIAGENDR     RIAGENDR  2.627306075
## DR1TPROT     DR1TPROT  2.228053198
## DR1TSUGR     DR1TSUGR  1.764978062
## DR1TTFAT     DR1TTFAT  0.906583248
## DR1TCARB     DR1TCARB  0.813591039
## DR1TKCAL     DR1TKCAL  0.771105903
## choc_grams choc_grams  0.015793423
## chocolate   chocolate  0.007512434

#Define Evaluation Metrics

# 6. CREATE EVALUATION FUNCTIONS

rmse_fun <- function(actual, predicted) {
  sqrt(mean((actual - predicted)^2))
}

mae_fun <- function(actual, predicted) {
  mean(abs(actual - predicted))
}

r2_fun <- function(actual, predicted) {
  cor(actual, predicted)^2
}

#Compare All Three Models

model_comparison <- data.frame(
  Model = c(
    "Linear Regression",
    "Random Forest Regression",
    "Gradient Boosting Regression"
  ),
  
  RMSE = c(
    rmse_fun(test_data$avg_systolic, lm_pred),
    rmse_fun(test_data$avg_systolic, rf_pred),
    rmse_fun(test_data$avg_systolic, gbm_pred)
  ),
  
  MAE = c(
    mae_fun(test_data$avg_systolic, lm_pred),
    mae_fun(test_data$avg_systolic, rf_pred),
    mae_fun(test_data$avg_systolic, gbm_pred)
  ),
  
  R_squared = c(
    r2_fun(test_data$avg_systolic, lm_pred),
    r2_fun(test_data$avg_systolic, rf_pred),
    r2_fun(test_data$avg_systolic, gbm_pred)
  )
)

# Print comparison table
print(model_comparison)
##                          Model     RMSE      MAE R_squared
## 1            Linear Regression 16.38001 12.08913 0.3343688
## 2     Random Forest Regression 16.50058 12.23541 0.3257989
## 3 Gradient Boosting Regression 16.20762 11.98067 0.3475444

#Choose Best Model Automatically

best_model_rmse <- model_comparison %>%
  arrange(RMSE) %>%
  slice(1)

print(best_model_rmse)
##                          Model     RMSE      MAE R_squared
## 1 Gradient Boosting Regression 16.20762 11.98067 0.3475444

Lower RMSE = better model Lower MAE = better model Higher R_squared = better model

#Bar Chart: RMSE Comparison

ggplot(model_comparison, aes(x = Model, y = RMSE, fill = Model)) +
  geom_col(width = 0.7) +
  geom_text(aes(label = round(RMSE, 2)), vjust = -0.5, size = 4) +
  labs(
    title = "Comparison of Regression Models Using RMSE",
    x = "Machine Learning Model",
    y = "RMSE"
  ) +
  scale_fill_brewer(palette = "Set2") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 25, hjust = 1),
    legend.position = "none"
  )

#Bar Chart: R-Squared Comparison

# =========================================================
# 10. VISUALIZE R-SQUARED COMPARISON
# =========================================================

ggplot(model_comparison, aes(x = Model, y = R_squared, fill = Model)) +
  geom_col(width = 0.7) +
  geom_text(aes(label = round(R_squared, 3)), vjust = -0.5, size = 4) +
  labs(
    title = "Comparison of Regression Models Using R-Squared",
    x = "Machine Learning Model",
    y = "R-Squared"
  ) +
  scale_fill_brewer(palette = "Set2") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 25, hjust = 1),
    legend.position = "none"
  )

#Actual vs Predicted value Plots

prediction_results <- data.frame(
  Actual = test_data$avg_systolic,
  Linear_Regression = lm_pred,
  Random_Forest = rf_pred,
  Gradient_Boosting = gbm_pred
)

head(prediction_results)
##      Actual Linear_Regression Random_Forest Gradient_Boosting
## 10 114.0000          138.3747      142.6584          135.2758
## 14 101.3333          104.9084      101.2164          105.3998
## 20 124.0000          131.0123      129.8408          129.0020
## 27 109.3333          105.6999      103.2622          105.4167
## 38 116.0000          138.1187      132.8147          134.3736
## 39 104.0000          112.2495      110.0748          112.0120

Conclusion:- This table shows that the models are able to predict many systolic BP values near the actual values, especially in the normal or moderate range. However, for very high blood pressure values, such as 162, all models underestimated the actual BP. This suggests that the models may need stronger predictors, more data, or additional clinical variables to better capture high blood pressure cases.

prediction_compare <- data.frame(
  Actual = test_data$avg_systolic,
  Linear_Regression = lm_pred,
  Random_Forest = rf_pred,
  Gradient_Boosting = gbm_pred
)

library(dplyr)
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.5.3
library(ggplot2)

prediction_long <- prediction_compare %>%
  pivot_longer(
    cols = c(Linear_Regression, Random_Forest, Gradient_Boosting),
    names_to = "Model",
    values_to = "Predicted"
  )

prediction_sample <- prediction_compare %>%
  slice(1:20) %>%
  mutate(Row = row_number())

prediction_sample_long <- prediction_sample %>%
  pivot_longer(
    cols = c(Actual, Linear_Regression, Random_Forest, Gradient_Boosting),
    names_to = "Type",
    values_to = "BP_Value"
  )

ggplot(prediction_sample_long, aes(x = Row, y = BP_Value, color = Type)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  labs(
    title = "Actual vs Predicted Systolic BP for Sample Test Observations",
    subtitle = "Comparison across Linear Regression, Random Forest, and Gradient Boosting",
    x = "Test Data Row",
    y = "Average Systolic BP",
    color = "Actual / Predicted"
  ) +
  theme_minimal()

This graph is clearly shows how close each model prediction is to the actual value.

#Add Error Column for Model Comparison
prediction_error <- prediction_compare %>%
  mutate(
    Error_Linear_Regression = abs(Actual - Linear_Regression),
    Error_Random_Forest = abs(Actual - Random_Forest),
    Error_Gradient_Boosting = abs(Actual - Gradient_Boosting)
  )

head(prediction_error)
##      Actual Linear_Regression Random_Forest Gradient_Boosting
## 10 114.0000          138.3747      142.6584          135.2758
## 14 101.3333          104.9084      101.2164          105.3998
## 20 124.0000          131.0123      129.8408          129.0020
## 27 109.3333          105.6999      103.2622          105.4167
## 38 116.0000          138.1187      132.8147          134.3736
## 39 104.0000          112.2495      110.0748          112.0120
##    Error_Linear_Regression Error_Random_Forest Error_Gradient_Boosting
## 10               24.374728          28.6584177               21.275770
## 14                3.575082           0.1169215                4.066419
## 20                7.012342           5.8408382                5.001980
## 27                3.633438           6.0711474                3.916661
## 38               22.118657          16.8147292               18.373597
## 39                8.249469           6.0748453                8.012028
#Error Comparison Boxplot
error_long <- prediction_error %>%
  select(
    Error_Linear_Regression,
    Error_Random_Forest,
    Error_Gradient_Boosting
  ) %>%
  pivot_longer(
    cols = everything(),
    names_to = "Model",
    values_to = "Absolute_Error"
  )

ggplot(error_long, aes(x = Model, y = Absolute_Error, fill = Model)) +
  geom_boxplot(alpha = 0.7) +
  labs(
    title = "Prediction Error Comparison Across Models",
    subtitle = "Lower absolute error indicates better model performance",
    x = "Model",
    y = "Absolute Prediction Error"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

#========================================================= #AVERAGE BLOOD PRESSURE REDUCTION GRAPH #Moderate Chocolate/Cocoa Intake vs No Intake #=========================================================

library(dplyr)
library(tidyr)
library(ggplot2)
library(dplyr)
library(tidyr)
library(ggplot2)

# =========================================================
# Create analysis_data from final_data
# =========================================================

analysis_data <- final_data %>%
  mutate(
    BPXSY1 = as.numeric(BPXSY1),
    BPXSY2 = as.numeric(BPXSY2),
    BPXSY3 = as.numeric(BPXSY3),
    BPXSY4 = as.numeric(BPXSY4),
    
    BPXDI1 = as.numeric(BPXDI1),
    BPXDI2 = as.numeric(BPXDI2),
    BPXDI3 = as.numeric(BPXDI3),
    BPXDI4 = as.numeric(BPXDI4),
    
    avg_systolic = rowMeans(
      select(., BPXSY1, BPXSY2, BPXSY3, BPXSY4),
      na.rm = TRUE
    ),
    
    avg_diastolic = rowMeans(
      select(., BPXDI1, BPXDI2, BPXDI3, BPXDI4),
      na.rm = TRUE
    ),
    
    avg_systolic = ifelse(is.nan(avg_systolic), NA, avg_systolic),
    avg_diastolic = ifelse(is.nan(avg_diastolic), NA, avg_diastolic),
    
    chocolate_level = case_when(
      choc_grams == 0 ~ "None",
      choc_grams > 0 & choc_grams <= 25 ~ "Low",
      choc_grams > 25 & choc_grams <= 50 ~ "Moderate",
      choc_grams > 50 ~ "High",
      TRUE ~ NA_character_
    )
  )

# =========================================================
# Average BP Reduction: None vs Moderate Chocolate/Cocoa Intake
# =========================================================

# 1. Keep only None and Moderate groups
bp_reduction_data <- analysis_data %>%
  filter(chocolate_level %in% c("None", "Moderate")) %>%
  select(chocolate_level, avg_systolic, avg_diastolic)

# Check counts before plotting
table(bp_reduction_data$chocolate_level, useNA = "ifany")
## 
## Moderate     None 
##       15     8654
bp_long <- bp_reduction_data %>%
  pivot_longer(
    cols = c(avg_systolic, avg_diastolic),
    names_to = "BP_Type",
    values_to = "BP_Value"
  ) %>%
  filter(!is.na(BP_Value)) %>%
  mutate(
    BP_Type = case_when(
      BP_Type == "avg_systolic" ~ "Systolic BP",
      BP_Type == "avg_diastolic" ~ "Diastolic BP"
    )
  )

# Check how many valid BP values are available
bp_long %>%
  count(chocolate_level, BP_Type)
## # A tibble: 4 × 3
##   chocolate_level BP_Type          n
##   <chr>           <chr>        <int>
## 1 Moderate        Diastolic BP    10
## 2 Moderate        Systolic BP     10
## 3 None            Diastolic BP  6678
## 4 None            Systolic BP   6678
bp_summary <- bp_long %>%
  group_by(chocolate_level, BP_Type) %>%
  summarise(
    mean_bp = mean(BP_Value, na.rm = TRUE),
    sd_bp = sd(BP_Value, na.rm = TRUE),
    n = n(),
    se = ifelse(n > 1, sd_bp / sqrt(n), 0),
    .groups = "drop"
  )

bp_summary
## # A tibble: 4 × 6
##   chocolate_level BP_Type      mean_bp sd_bp     n    se
##   <chr>           <chr>          <dbl> <dbl> <int> <dbl>
## 1 Moderate        Diastolic BP    75.9  6.72    10 2.12 
## 2 Moderate        Systolic BP    134.  23.6     10 7.47 
## 3 None            Diastolic BP    68.3 15.8   6678 0.193
## 4 None            Systolic BP    122.  20.3   6678 0.249
bp_reduction_summary <- bp_summary %>%
  select(BP_Type, chocolate_level, mean_bp, se, n) %>%
  pivot_wider(
    names_from = chocolate_level,
    values_from = c(mean_bp, se, n),
    names_glue = "{.value}_{chocolate_level}"
  ) %>%
  mutate(
    Reduction = mean_bp_None - mean_bp_Moderate,
    Error = sqrt(se_None^2 + se_Moderate^2)
  )

bp_reduction_summary
## # A tibble: 2 × 9
##   BP_Type    mean_bp_Moderate mean_bp_None se_Moderate se_None n_Moderate n_None
##   <chr>                 <dbl>        <dbl>       <dbl>   <dbl>      <int>  <int>
## 1 Diastolic…             75.9         68.3        2.12   0.193         10   6678
## 2 Systolic …            134.         122.         7.47   0.249         10   6678
## # ℹ 2 more variables: Reduction <dbl>, Error <dbl>
ggplot(bp_reduction_summary, aes(x = BP_Type, y = Reduction, fill = BP_Type)) +
  geom_col(width = 0.65, alpha = 0.85) +
  geom_errorbar(
    aes(
      ymin = Reduction - Error,
      ymax = Reduction + Error
    ),
    width = 0.15,
    linewidth = 0.8,
    color = "black"
  ) +
  geom_text(
    aes(label = round(Reduction, 2)),
    vjust = ifelse(bp_reduction_summary$Reduction >= 0, -0.6, 1.3),
    size = 5
  ) +
  labs(
    title = "Average Blood Pressure Difference (Moderate Cocoa Intake)",
    subtitle = "Calculated as No Intake group mean minus Moderate Intake group mean",
    x = "",
    y = "Difference in Blood Pressure (mmHg)"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text.x = element_text(face = "bold"),
    panel.grid.minor = element_blank()
  )

based on NHANES data, the Moderate chocolate/cocoa group has higher average BP than the None group, not lower BP.

In this dataset, the moderate chocolate/cocoa intake group had higher average systolic and diastolic blood pressure compared with the no-intake group. Therefore, the calculated reduction values are negative. This means the data does not show an average blood pressure reduction for the moderate intake group. However, the moderate group has only 10 complete BP observations compared with 6,678 observations in the no-intake group, so this result should be interpreted carefully as an exploratory observational finding.

#ROC Curve and AUC for All Models # ========================================================= # ROC CURVE AND AUC COMPARISON FOR ALL MODELS # Models: Linear Regression, Random Forest, Gradient Boosting # Outcome converted to binary: High BP vs Normal BP # =========================================================

# Install only once if needed
# install.packages("pROC")

library(pROC)
library(dplyr)
library(ggplot2)
library(knitr)

# -------------------------------
# 1. Create actual High BP class
# -------------------------------

actual_class <- ifelse(test_data$avg_systolic >= 130, 1, 0)

# 1 = High BP
# 0 = Normal BP

# -------------------------------
# 2. Create ROC objects
# -------------------------------

roc_lm <- roc(
  response = actual_class,
  predictor = lm_pred
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_rf <- roc(
  response = actual_class,
  predictor = rf_pred
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_gbm <- roc(
  response = actual_class,
  predictor = gbm_pred
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# -------------------------------
# 3. Calculate AUC values
# -------------------------------

auc_lm <- auc(roc_lm)
auc_rf <- auc(roc_rf)
auc_gbm <- auc(roc_gbm)

auc_lm
## Area under the curve: 0.8166
auc_rf
## Area under the curve: 0.8043
auc_gbm
## Area under the curve: 0.8205

#AUC Comparison Table

auc_table <- data.frame(
  Model = c(
    "Linear Regression",
    "Random Forest Regression",
    "Gradient Boosting Regression"
  ),
  
  AUC = c(
    as.numeric(auc_lm),
    as.numeric(auc_rf),
    as.numeric(auc_gbm)
  )
)

auc_table <- auc_table %>%
  mutate(AUC = round(AUC, 3)) %>%
  arrange(desc(AUC))

kable(
  auc_table,
  caption = "AUC Comparison of Machine Learning Models",
  align = "c"
)
AUC Comparison of Machine Learning Models
Model AUC
Gradient Boosting Regression 0.820
Linear Regression 0.817
Random Forest Regression 0.804

#Plot ROC Curves Together

plot(
  roc_lm,
  col = "blue",
  lwd = 2,
  main = "ROC Curve Comparison for High Blood Pressure Prediction"
)

plot(
  roc_rf,
  col = "red",
  lwd = 2,
  add = TRUE
)

plot(
  roc_gbm,
  col = "darkgreen",
  lwd = 2,
  add = TRUE
)

legend(
  "bottomright",
  legend = c(
    paste("Linear Regression AUC =", round(auc_lm, 3)),
    paste("Random Forest AUC =", round(auc_rf, 3)),
    paste("Gradient Boosting AUC =", round(auc_gbm, 3))
  ),
  col = c("blue", "red", "darkgreen"),
  lwd = 2
)

# Convert ROC objects into data frames

roc_lm_df <- data.frame(
  Specificity = roc_lm$specificities,
  Sensitivity = roc_lm$sensitivities,
  Model = "Linear Regression"
)

roc_rf_df <- data.frame(
  Specificity = roc_rf$specificities,
  Sensitivity = roc_rf$sensitivities,
  Model = "Random Forest Regression"
)

roc_gbm_df <- data.frame(
  Specificity = roc_gbm$specificities,
  Sensitivity = roc_gbm$sensitivities,
  Model = "Gradient Boosting Regression"
)

roc_all_df <- rbind(roc_lm_df, roc_rf_df, roc_gbm_df)

ggplot(roc_all_df, aes(x = 1 - Specificity, y = Sensitivity, color = Model)) +
  geom_line(size = 1.0) +
  geom_abline(
    intercept = 0,
    slope = 1,
    linetype = "dashed",
    color = "gray"
  ) +
  labs(
    title = "ROC Curve Comparison of Machine Learning Models",
    subtitle = "High BP defined as average systolic blood pressure ≥ 130 mmHg",
    x = "False Positive Rate (1 - Specificity)",
    y = "True Positive Rate (Sensitivity)",
    color = "Model"
  ) +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#========================================================= #FINAL MACHINE LEARNING MODEL COMPARISON TABLE #Regression Metrics + Classification Metrics #Outcome: avg_systolic #Classification threshold: High BP if systolic BP >= 130 #=========================================================

library(dplyr)
library(knitr)

# -------------------------------
# 1. Regression Metric Functions
# -------------------------------

rmse_fun <- function(actual, predicted) {
  sqrt(mean((actual - predicted)^2, na.rm = TRUE))
}

mae_fun <- function(actual, predicted) {
  mean(abs(actual - predicted), na.rm = TRUE)
}

r2_fun <- function(actual, predicted) {
  cor(actual, predicted, use = "complete.obs")^2
}

# -------------------------------
# 2. Classification Metric Function
# -------------------------------

class_metrics_fun <- function(actual_bp, predicted_bp, threshold = 130) {
  
  actual_class <- ifelse(actual_bp >= threshold, "High BP", "Normal BP")
  predicted_class <- ifelse(predicted_bp >= threshold, "High BP", "Normal BP")
  
  actual_class <- factor(actual_class, levels = c("High BP", "Normal BP"))
  predicted_class <- factor(predicted_class, levels = c("High BP", "Normal BP"))
  
  cm <- table(Predicted = predicted_class, Actual = actual_class)
  
  TP <- cm["High BP", "High BP"]
  TN <- cm["Normal BP", "Normal BP"]
  FP <- cm["High BP", "Normal BP"]
  FN <- cm["Normal BP", "High BP"]
  
  accuracy <- (TP + TN) / sum(cm)
  precision <- TP / (TP + FP)
  recall <- TP / (TP + FN)
  specificity <- TN / (TN + FP)
  f1_score <- 2 * ((precision * recall) / (precision + recall))
  
  return(
    c(
      Accuracy = accuracy,
      Precision = precision,
      Recall = recall,
      Specificity = specificity,
      F1_Score = f1_score
    )
  )
}

# -------------------------------
# 3. Calculate Classification Metrics
# -------------------------------

lm_class_metrics <- class_metrics_fun(test_data$avg_systolic, lm_pred)
rf_class_metrics <- class_metrics_fun(test_data$avg_systolic, rf_pred)
gbm_class_metrics <- class_metrics_fun(test_data$avg_systolic, gbm_pred)

# -------------------------------
# 4. Create Final Comparison Table
# -------------------------------

model_comparison_final <- data.frame(
  Model = c(
    "Linear Regression",
    "Random Forest Regression",
    "Gradient Boosting Regression"
  ),
  
  RMSE = c(
    rmse_fun(test_data$avg_systolic, lm_pred),
    rmse_fun(test_data$avg_systolic, rf_pred),
    rmse_fun(test_data$avg_systolic, gbm_pred)
  ),
  
  MAE = c(
    mae_fun(test_data$avg_systolic, lm_pred),
    mae_fun(test_data$avg_systolic, rf_pred),
    mae_fun(test_data$avg_systolic, gbm_pred)
  ),
  
  R_Squared = c(
    r2_fun(test_data$avg_systolic, lm_pred),
    r2_fun(test_data$avg_systolic, rf_pred),
    r2_fun(test_data$avg_systolic, gbm_pred)
  ),
  
  Accuracy = c(
    lm_class_metrics["Accuracy"],
    rf_class_metrics["Accuracy"],
    gbm_class_metrics["Accuracy"]
  ),
  
  Precision = c(
    lm_class_metrics["Precision"],
    rf_class_metrics["Precision"],
    gbm_class_metrics["Precision"]
  ),
  
  Recall = c(
    lm_class_metrics["Recall"],
    rf_class_metrics["Recall"],
    gbm_class_metrics["Recall"]
  ),
  
  Specificity = c(
    lm_class_metrics["Specificity"],
    rf_class_metrics["Specificity"],
    gbm_class_metrics["Specificity"]
  ),
  
  F1_Score = c(
    lm_class_metrics["F1_Score"],
    rf_class_metrics["F1_Score"],
    gbm_class_metrics["F1_Score"]
  )
)

# Round values for clean presentation
model_comparison_final <- model_comparison_final %>%
  mutate(across(where(is.numeric), ~ round(.x, 3)))

# Print nice statistical table
kable(
  model_comparison_final,
  caption = "Machine Learning Model Comparison for Predicting Average Systolic Blood Pressure",
  align = "c"
)
Machine Learning Model Comparison for Predicting Average Systolic Blood Pressure
Model RMSE MAE R_Squared Accuracy Precision Recall Specificity F1_Score
Linear Regression 16.380 12.089 0.334 0.742 0.529 0.622 0.787 0.572
Random Forest Regression 16.501 12.235 0.326 0.738 0.522 0.617 0.784 0.566
Gradient Boosting Regression 16.208 11.981 0.348 0.745 0.531 0.681 0.769 0.597

Interpetation :-

To compare the machine learning pipelines, the same cleaned NHANES dataset was used for all models. The outcome variable was average systolic blood pressure, and the predictor variables included chocolate/cocoa intake, chocolate grams, age, gender, total calories, sugar, fat, protein, and carbohydrate intake. The dataset was split into 80% training and 20% testing data. Three regression models were developed: Linear Regression, Random Forest Regression, and Gradient Boosting Regression. Model performance was evaluated using RMSE, MAE, and R-squared. RMSE and MAE measured prediction error, while R-squared measured the proportion of variation in systolic blood pressure explained by the model.

The best-performing model was selected based on the lowest RMSE and MAE and the highest R-squared. Linear Regression was useful for interpretation and identifying statistically significant predictors. Random Forest Regression and Gradient Boosting Regression were useful for capturing non-linear relationships between dietary intake, demographic factors, and systolic blood pressure.

The machine learning pipelines were compared using identical training and testing datasets to ensure fairness. Linear Regression served as the baseline model, while Random Forest Regression and Gradient Boosting Regression were used to evaluate whether non-linear machine learning methods improved prediction of average systolic blood pressure. Performance was compared using RMSE, MAE, and R-squared. The model with the lowest RMSE and MAE and highest R-squared was considered the best predictive model. Variable importance from Random Forest and Gradient Boosting was also examined to determine whether chocolate/cocoa intake contributed meaningfully compared with stronger predictors such as age, gender, and total dietary intake.

COSMOS by saying that clinical trial evidence found cocoa extract did not significantly reduce total cardiovascular events, although secondary analyses suggested reduced CVD death; therefore, your NHANES ML analysis should be interpreted as an exploratory prediction study rather than causal evidence.