#========================================================= #COMPARE MACHINE LEARNING PIPELINES #Outcome: Average systolic blood pressure #input var: #avg_systolic, # Outcome variable: average systolic BP #chocolate, # Chocolate intake group: 0 = No, 1 = Yes #choc_grams, # Chocolate/cocoa grams #RIDAGEYR, # Age #RIAGENDR, # Gender #DR1TKCAL, # Total calories #DR1TSUGR, # Total sugar #DR1TTFAT, # Total fat #DR1TPROT, # Total protein #DR1TCARB # Total carbohydrates #output var: #Models: #1. Linear Regression #2. Random Forest Regression #3. Gradient Boosting Regression #=========================================================
library(httr)
library(haven)
## Warning: package 'haven' was built under R version 4.5.3
url <- "https://wwwn.cdc.gov/Nchs/Nhanes/2017-20121/DEMO_J.XPT"
dest <- file.path(tempdir(), "DEMO_J.XPT")
res <- GET(
url,
user_agent("R"),
write_disk(dest, overwrite = TRUE)
)
print(file.info(dest)$size)
## [1] 20905
#LOAD REQUIRED LIBRARIES
library(dplyr) # Used for data cleaning, grouping, and summarizing
## Warning: package 'dplyr' was built under R version 4.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2) # Used for creating visualizations
## Warning: package 'ggplot2' was built under R version 4.5.3
library(psych) # Used for descriptive statistics
## Warning: package 'psych' was built under R version 4.5.3
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(broom) # Used to convert model results into clean tables
## Warning: package 'broom' was built under R version 4.5.3
library(nhanesA) # Download NHANES datasets directly in R
## Warning: package 'nhanesA' was built under R version 4.5.3
library(dplyr) # Data manipulation
library(caret) # Train/test split + preprocessing + ML utilities
## Warning: package 'caret' was built under R version 4.5.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:httr':
##
## progress
library(randomForest) # Random Forest model
## Warning: package 'randomForest' was built under R version 4.5.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(pROC) # ROC and AUC evaluation
## Warning: package 'pROC' was built under R version 4.5.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
#2. DOWNLOAD NHANES DATA (REAL DATA) Datasets
DEMO_J <- nhanes("DEMO_J") # Demographic data
BPX_J <- nhanes("BPX_J") # Blood pressure examination data
DR1TOT_J <- nhanes("DR1TOT_J") # Day 1 total nutrient intake
DR1IFF_J <- nhanes("DR1IFF_J") # Day 1 individual foods file
dim(DR1IFF_J)
## [1] 112683 84
#CREATE CHOCOLATE INDICATOR
chocolate_codes <- c(
53111000, 53113000, 53114000, 53201000, 53202000,
53203000, 53233060, 54101000, 54102000, 92530010
)
# Filter food-level data to rows representing chocolate-related foods
chocolate_data <- DR1IFF_J %>%
filter(DR1IFDCD %in% chocolate_codes) %>% # Keep only chocolate-coded foods
group_by(SEQN) %>% # Group by participant ID
summarise(choc_grams = sum(DR1IGRMS, na.rm = TRUE), .groups = "drop") %>%
mutate(chocolate = 1) # Mark participant as chocolate consumer
#MERGE NHANES TABLES INTO ONE ANALYSIS DATASET
final_data <- DEMO_J %>%
inner_join(BPX_J, by = "SEQN") %>% # Merge demographics + blood pressure
inner_join(DR1TOT_J, by = "SEQN") %>% # Merge total nutrition intake
left_join(chocolate_data, by = "SEQN") # Add chocolate indicator and grams
#REPLACE MISSING CHOCOLATE VALUES
final_data <- final_data %>%
mutate(
chocolate = ifelse(is.na(chocolate), 0, chocolate), # 0 = no chocolate food found
choc_grams = ifelse(is.na(choc_grams), 0, choc_grams) # 0 grams if no chocolate intake
)
#CREATE AVERAGE SYSTOLIC BLOOD PRESSURE
final_data <- final_data %>%
mutate(
BPXSY1 = as.numeric(BPXSY1), # First systolic reading
BPXSY2 = as.numeric(BPXSY2), # Second systolic reading
BPXSY3 = as.numeric(BPXSY3), # Third systolic reading
BPXSY4 = as.numeric(BPXSY4) # Fourth systolic reading
) %>%
mutate(
avg_systolic = rowMeans(
select(., BPXSY1, BPXSY2, BPXSY3, BPXSY4),
na.rm = TRUE
)
) %>%
mutate(
avg_systolic = ifelse(is.nan(avg_systolic), NA, avg_systolic)
# If all BP readings were missing, rowMeans gives NaN, so convert to NA
)
#PREPARE REGRESSION DATASET
regression_data <- final_data %>%
select(
avg_systolic, # Outcome variable: average systolic BP
chocolate, # Chocolate intake group: 0 = No, 1 = Yes
choc_grams, # Chocolate/cocoa grams
RIDAGEYR, # Age
RIAGENDR, # Gender
DR1TKCAL, # Total calories
DR1TSUGR, # Total sugar
DR1TTFAT, # Total fat
DR1TPROT, # Total protein
DR1TCARB # Total carbohydrates
) %>%
filter(
!is.na(avg_systolic) # Remove rows without systolic BP
) %>%
mutate(
chocolate = as.factor(chocolate), # Convert chocolate group to factor
RIAGENDR = as.factor(RIAGENDR), # Convert gender to factor
choc_grams = as.numeric(choc_grams), # Ensure chocolate grams is numeric
RIDAGEYR = as.numeric(RIDAGEYR), # Ensure age is numeric
DR1TKCAL = as.numeric(DR1TKCAL), # Ensure calories are numeric
DR1TSUGR = as.numeric(DR1TSUGR), # Ensure sugar is numeric
DR1TTFAT = as.numeric(DR1TTFAT), # Ensure fat is numeric
DR1TPROT = as.numeric(DR1TPROT), # Ensure protein is numeric
DR1TCARB = as.numeric(DR1TCARB) # Ensure carbohydrates are numeric
) %>%
na.omit() # Remove remaining missing values
# Check final sample size
dim(regression_data)
## [1] 6125 10
#TRAIN / TEST SPLIT
set.seed(123) # Reproducibility
train_index <- createDataPartition(
regression_data$avg_systolic, # Outcome variable
p = 0.80, # 80% training, 20% testing
list = FALSE
)
train_data <- regression_data[train_index, ] # Training data
test_data <- regression_data[-train_index, ] # Testing data
# Confirm sample size
nrow(train_data)
## [1] 4902
nrow(test_data)
## [1] 1223
#Build Model 1: Linear Regression
lm_model <- lm(
avg_systolic ~ chocolate + choc_grams + RIDAGEYR + RIAGENDR +
DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB,
data = train_data
)
# View coefficients, p-values, and R-squared
summary(lm_model)
##
## Call:
## lm(formula = avg_systolic ~ chocolate + choc_grams + RIDAGEYR +
## RIAGENDR + DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB,
## data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.801 -10.019 -1.421 8.012 105.319
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 99.772446 0.807594 123.543 < 2e-16 ***
## chocolate1 3.962369 5.126977 0.773 0.439650
## choc_grams -0.047680 0.078513 -0.607 0.543687
## RIDAGEYR 0.543021 0.010124 53.639 < 2e-16 ***
## RIAGENDRFemale -1.679276 0.473948 -3.543 0.000399 ***
## DR1TKCAL 0.006243 0.001391 4.489 7.33e-06 ***
## DR1TSUGR -0.002533 0.005791 -0.437 0.661882
## DR1TTFAT -0.050211 0.014317 -3.507 0.000457 ***
## DR1TPROT -0.034535 0.010566 -3.269 0.001089 **
## DR1TCARB -0.022575 0.006869 -3.286 0.001022 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.76 on 4892 degrees of freedom
## Multiple R-squared: 0.3783, Adjusted R-squared: 0.3772
## F-statistic: 330.8 on 9 and 4892 DF, p-value: < 2.2e-16
# Predict on test data
lm_pred <- predict(
lm_model,
newdata = test_data
)
#Build Model 2: Random Forest Regression
set.seed(123)
rf_model <- randomForest(
avg_systolic ~ chocolate + choc_grams + RIDAGEYR + RIAGENDR +
DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB,
data = train_data,
ntree = 500, # Number of trees
importance = TRUE # Calculate variable importance
)
# View Random Forest summary
print(rf_model)
##
## Call:
## randomForest(formula = avg_systolic ~ chocolate + choc_grams + RIDAGEYR + RIAGENDR + DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB, data = train_data, ntree = 500, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 255.0425
## % Var explained: 36.07
# Predict on test data
rf_pred <- predict(
rf_model,
newdata = test_data
)
# Variable importance
importance(rf_model)
## %IncMSE IncNodePurity
## chocolate -1.08349 2904.220
## choc_grams 1.19876 5654.535
## RIDAGEYR 221.99465 826238.080
## RIAGENDR 18.57220 31681.175
## DR1TKCAL 30.09189 179466.588
## DR1TSUGR 24.72750 194394.499
## DR1TTFAT 27.48672 179100.914
## DR1TPROT 25.53958 194466.940
## DR1TCARB 26.14524 181115.260
# Plot variable importance
varImpPlot(
rf_model,
main = "Random Forest Variable Importance"
)
#Build Model 3: Gradient Boosting Regression
# install.packages("gbm")
library(gbm)
## Warning: package 'gbm' was built under R version 4.5.3
## Loaded gbm 2.2.3
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
set.seed(123)
gbm_model <- gbm(
avg_systolic ~ chocolate + choc_grams + RIDAGEYR + RIAGENDR +
DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB,
data = train_data,
distribution = "gaussian", # Regression outcome
n.trees = 1000, # Number of boosting trees
interaction.depth = 3, # Tree depth
shrinkage = 0.01, # Learning rate
n.minobsinnode = 10, # Minimum observations per terminal node
cv.folds = 5, # Cross-validation
verbose = FALSE
)
# Find best number of trees
best_trees <- gbm.perf(
gbm_model,
method = "cv"
)
# Predict on test data
gbm_pred <- predict(
gbm_model,
newdata = test_data,
n.trees = best_trees
)
# Variable importance
summary(
gbm_model,
n.trees = best_trees
)
## var rel.inf
## RIDAGEYR RIDAGEYR 90.865076619
## RIAGENDR RIAGENDR 2.627306075
## DR1TPROT DR1TPROT 2.228053198
## DR1TSUGR DR1TSUGR 1.764978062
## DR1TTFAT DR1TTFAT 0.906583248
## DR1TCARB DR1TCARB 0.813591039
## DR1TKCAL DR1TKCAL 0.771105903
## choc_grams choc_grams 0.015793423
## chocolate chocolate 0.007512434
#Define Evaluation Metrics
# 6. CREATE EVALUATION FUNCTIONS
rmse_fun <- function(actual, predicted) {
sqrt(mean((actual - predicted)^2))
}
mae_fun <- function(actual, predicted) {
mean(abs(actual - predicted))
}
r2_fun <- function(actual, predicted) {
cor(actual, predicted)^2
}
#Compare All Three Models
model_comparison <- data.frame(
Model = c(
"Linear Regression",
"Random Forest Regression",
"Gradient Boosting Regression"
),
RMSE = c(
rmse_fun(test_data$avg_systolic, lm_pred),
rmse_fun(test_data$avg_systolic, rf_pred),
rmse_fun(test_data$avg_systolic, gbm_pred)
),
MAE = c(
mae_fun(test_data$avg_systolic, lm_pred),
mae_fun(test_data$avg_systolic, rf_pred),
mae_fun(test_data$avg_systolic, gbm_pred)
),
R_squared = c(
r2_fun(test_data$avg_systolic, lm_pred),
r2_fun(test_data$avg_systolic, rf_pred),
r2_fun(test_data$avg_systolic, gbm_pred)
)
)
# Print comparison table
print(model_comparison)
## Model RMSE MAE R_squared
## 1 Linear Regression 16.38001 12.08913 0.3343688
## 2 Random Forest Regression 16.50058 12.23541 0.3257989
## 3 Gradient Boosting Regression 16.20762 11.98067 0.3475444
#Choose Best Model Automatically
best_model_rmse <- model_comparison %>%
arrange(RMSE) %>%
slice(1)
print(best_model_rmse)
## Model RMSE MAE R_squared
## 1 Gradient Boosting Regression 16.20762 11.98067 0.3475444
Lower RMSE = better model Lower MAE = better model Higher R_squared = better model
#Bar Chart: RMSE Comparison
ggplot(model_comparison, aes(x = Model, y = RMSE, fill = Model)) +
geom_col(width = 0.7) +
geom_text(aes(label = round(RMSE, 2)), vjust = -0.5, size = 4) +
labs(
title = "Comparison of Regression Models Using RMSE",
x = "Machine Learning Model",
y = "RMSE"
) +
scale_fill_brewer(palette = "Set2") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 25, hjust = 1),
legend.position = "none"
)
#Bar Chart: R-Squared Comparison
# =========================================================
# 10. VISUALIZE R-SQUARED COMPARISON
# =========================================================
ggplot(model_comparison, aes(x = Model, y = R_squared, fill = Model)) +
geom_col(width = 0.7) +
geom_text(aes(label = round(R_squared, 3)), vjust = -0.5, size = 4) +
labs(
title = "Comparison of Regression Models Using R-Squared",
x = "Machine Learning Model",
y = "R-Squared"
) +
scale_fill_brewer(palette = "Set2") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 25, hjust = 1),
legend.position = "none"
)
#Actual vs Predicted value Plots
prediction_results <- data.frame(
Actual = test_data$avg_systolic,
Linear_Regression = lm_pred,
Random_Forest = rf_pred,
Gradient_Boosting = gbm_pred
)
head(prediction_results)
## Actual Linear_Regression Random_Forest Gradient_Boosting
## 10 114.0000 138.3747 142.6584 135.2758
## 14 101.3333 104.9084 101.2164 105.3998
## 20 124.0000 131.0123 129.8408 129.0020
## 27 109.3333 105.6999 103.2622 105.4167
## 38 116.0000 138.1187 132.8147 134.3736
## 39 104.0000 112.2495 110.0748 112.0120
Conclusion:- This table shows that the models are able to predict many systolic BP values near the actual values, especially in the normal or moderate range. However, for very high blood pressure values, such as 162, all models underestimated the actual BP. This suggests that the models may need stronger predictors, more data, or additional clinical variables to better capture high blood pressure cases.
prediction_compare <- data.frame(
Actual = test_data$avg_systolic,
Linear_Regression = lm_pred,
Random_Forest = rf_pred,
Gradient_Boosting = gbm_pred
)
library(dplyr)
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.5.3
library(ggplot2)
prediction_long <- prediction_compare %>%
pivot_longer(
cols = c(Linear_Regression, Random_Forest, Gradient_Boosting),
names_to = "Model",
values_to = "Predicted"
)
prediction_sample <- prediction_compare %>%
slice(1:20) %>%
mutate(Row = row_number())
prediction_sample_long <- prediction_sample %>%
pivot_longer(
cols = c(Actual, Linear_Regression, Random_Forest, Gradient_Boosting),
names_to = "Type",
values_to = "BP_Value"
)
ggplot(prediction_sample_long, aes(x = Row, y = BP_Value, color = Type)) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
labs(
title = "Actual vs Predicted Systolic BP for Sample Test Observations",
subtitle = "Comparison across Linear Regression, Random Forest, and Gradient Boosting",
x = "Test Data Row",
y = "Average Systolic BP",
color = "Actual / Predicted"
) +
theme_minimal()
This graph is clearly shows how close each model prediction is to the
actual value.
#Add Error Column for Model Comparison
prediction_error <- prediction_compare %>%
mutate(
Error_Linear_Regression = abs(Actual - Linear_Regression),
Error_Random_Forest = abs(Actual - Random_Forest),
Error_Gradient_Boosting = abs(Actual - Gradient_Boosting)
)
head(prediction_error)
## Actual Linear_Regression Random_Forest Gradient_Boosting
## 10 114.0000 138.3747 142.6584 135.2758
## 14 101.3333 104.9084 101.2164 105.3998
## 20 124.0000 131.0123 129.8408 129.0020
## 27 109.3333 105.6999 103.2622 105.4167
## 38 116.0000 138.1187 132.8147 134.3736
## 39 104.0000 112.2495 110.0748 112.0120
## Error_Linear_Regression Error_Random_Forest Error_Gradient_Boosting
## 10 24.374728 28.6584177 21.275770
## 14 3.575082 0.1169215 4.066419
## 20 7.012342 5.8408382 5.001980
## 27 3.633438 6.0711474 3.916661
## 38 22.118657 16.8147292 18.373597
## 39 8.249469 6.0748453 8.012028
#Error Comparison Boxplot
error_long <- prediction_error %>%
select(
Error_Linear_Regression,
Error_Random_Forest,
Error_Gradient_Boosting
) %>%
pivot_longer(
cols = everything(),
names_to = "Model",
values_to = "Absolute_Error"
)
ggplot(error_long, aes(x = Model, y = Absolute_Error, fill = Model)) +
geom_boxplot(alpha = 0.7) +
labs(
title = "Prediction Error Comparison Across Models",
subtitle = "Lower absolute error indicates better model performance",
x = "Model",
y = "Absolute Prediction Error"
) +
theme_minimal() +
theme(legend.position = "none")
#========================================================= #AVERAGE
BLOOD PRESSURE REDUCTION GRAPH #Moderate Chocolate/Cocoa Intake vs No
Intake #=========================================================
library(dplyr)
library(tidyr)
library(ggplot2)
library(dplyr)
library(tidyr)
library(ggplot2)
# =========================================================
# Create analysis_data from final_data
# =========================================================
analysis_data <- final_data %>%
mutate(
BPXSY1 = as.numeric(BPXSY1),
BPXSY2 = as.numeric(BPXSY2),
BPXSY3 = as.numeric(BPXSY3),
BPXSY4 = as.numeric(BPXSY4),
BPXDI1 = as.numeric(BPXDI1),
BPXDI2 = as.numeric(BPXDI2),
BPXDI3 = as.numeric(BPXDI3),
BPXDI4 = as.numeric(BPXDI4),
avg_systolic = rowMeans(
select(., BPXSY1, BPXSY2, BPXSY3, BPXSY4),
na.rm = TRUE
),
avg_diastolic = rowMeans(
select(., BPXDI1, BPXDI2, BPXDI3, BPXDI4),
na.rm = TRUE
),
avg_systolic = ifelse(is.nan(avg_systolic), NA, avg_systolic),
avg_diastolic = ifelse(is.nan(avg_diastolic), NA, avg_diastolic),
chocolate_level = case_when(
choc_grams == 0 ~ "None",
choc_grams > 0 & choc_grams <= 25 ~ "Low",
choc_grams > 25 & choc_grams <= 50 ~ "Moderate",
choc_grams > 50 ~ "High",
TRUE ~ NA_character_
)
)
# =========================================================
# Average BP Reduction: None vs Moderate Chocolate/Cocoa Intake
# =========================================================
# 1. Keep only None and Moderate groups
bp_reduction_data <- analysis_data %>%
filter(chocolate_level %in% c("None", "Moderate")) %>%
select(chocolate_level, avg_systolic, avg_diastolic)
# Check counts before plotting
table(bp_reduction_data$chocolate_level, useNA = "ifany")
##
## Moderate None
## 15 8654
bp_long <- bp_reduction_data %>%
pivot_longer(
cols = c(avg_systolic, avg_diastolic),
names_to = "BP_Type",
values_to = "BP_Value"
) %>%
filter(!is.na(BP_Value)) %>%
mutate(
BP_Type = case_when(
BP_Type == "avg_systolic" ~ "Systolic BP",
BP_Type == "avg_diastolic" ~ "Diastolic BP"
)
)
# Check how many valid BP values are available
bp_long %>%
count(chocolate_level, BP_Type)
## # A tibble: 4 × 3
## chocolate_level BP_Type n
## <chr> <chr> <int>
## 1 Moderate Diastolic BP 10
## 2 Moderate Systolic BP 10
## 3 None Diastolic BP 6678
## 4 None Systolic BP 6678
bp_summary <- bp_long %>%
group_by(chocolate_level, BP_Type) %>%
summarise(
mean_bp = mean(BP_Value, na.rm = TRUE),
sd_bp = sd(BP_Value, na.rm = TRUE),
n = n(),
se = ifelse(n > 1, sd_bp / sqrt(n), 0),
.groups = "drop"
)
bp_summary
## # A tibble: 4 × 6
## chocolate_level BP_Type mean_bp sd_bp n se
## <chr> <chr> <dbl> <dbl> <int> <dbl>
## 1 Moderate Diastolic BP 75.9 6.72 10 2.12
## 2 Moderate Systolic BP 134. 23.6 10 7.47
## 3 None Diastolic BP 68.3 15.8 6678 0.193
## 4 None Systolic BP 122. 20.3 6678 0.249
bp_reduction_summary <- bp_summary %>%
select(BP_Type, chocolate_level, mean_bp, se, n) %>%
pivot_wider(
names_from = chocolate_level,
values_from = c(mean_bp, se, n),
names_glue = "{.value}_{chocolate_level}"
) %>%
mutate(
Reduction = mean_bp_None - mean_bp_Moderate,
Error = sqrt(se_None^2 + se_Moderate^2)
)
bp_reduction_summary
## # A tibble: 2 × 9
## BP_Type mean_bp_Moderate mean_bp_None se_Moderate se_None n_Moderate n_None
## <chr> <dbl> <dbl> <dbl> <dbl> <int> <int>
## 1 Diastolic… 75.9 68.3 2.12 0.193 10 6678
## 2 Systolic … 134. 122. 7.47 0.249 10 6678
## # ℹ 2 more variables: Reduction <dbl>, Error <dbl>
ggplot(bp_reduction_summary, aes(x = BP_Type, y = Reduction, fill = BP_Type)) +
geom_col(width = 0.65, alpha = 0.85) +
geom_errorbar(
aes(
ymin = Reduction - Error,
ymax = Reduction + Error
),
width = 0.15,
linewidth = 0.8,
color = "black"
) +
geom_text(
aes(label = round(Reduction, 2)),
vjust = ifelse(bp_reduction_summary$Reduction >= 0, -0.6, 1.3),
size = 5
) +
labs(
title = "Average Blood Pressure Difference (Moderate Cocoa Intake)",
subtitle = "Calculated as No Intake group mean minus Moderate Intake group mean",
x = "",
y = "Difference in Blood Pressure (mmHg)"
) +
theme_minimal(base_size = 14) +
theme(
legend.position = "none",
plot.title = element_text(face = "bold", hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
axis.text.x = element_text(face = "bold"),
panel.grid.minor = element_blank()
)
based on NHANES data, the Moderate chocolate/cocoa group has higher
average BP than the None group, not lower BP.
In this dataset, the moderate chocolate/cocoa intake group had higher average systolic and diastolic blood pressure compared with the no-intake group. Therefore, the calculated reduction values are negative. This means the data does not show an average blood pressure reduction for the moderate intake group. However, the moderate group has only 10 complete BP observations compared with 6,678 observations in the no-intake group, so this result should be interpreted carefully as an exploratory observational finding.
#ROC Curve and AUC for All Models # ========================================================= # ROC CURVE AND AUC COMPARISON FOR ALL MODELS # Models: Linear Regression, Random Forest, Gradient Boosting # Outcome converted to binary: High BP vs Normal BP # =========================================================
# Install only once if needed
# install.packages("pROC")
library(pROC)
library(dplyr)
library(ggplot2)
library(knitr)
# -------------------------------
# 1. Create actual High BP class
# -------------------------------
actual_class <- ifelse(test_data$avg_systolic >= 130, 1, 0)
# 1 = High BP
# 0 = Normal BP
# -------------------------------
# 2. Create ROC objects
# -------------------------------
roc_lm <- roc(
response = actual_class,
predictor = lm_pred
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_rf <- roc(
response = actual_class,
predictor = rf_pred
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_gbm <- roc(
response = actual_class,
predictor = gbm_pred
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# -------------------------------
# 3. Calculate AUC values
# -------------------------------
auc_lm <- auc(roc_lm)
auc_rf <- auc(roc_rf)
auc_gbm <- auc(roc_gbm)
auc_lm
## Area under the curve: 0.8166
auc_rf
## Area under the curve: 0.8043
auc_gbm
## Area under the curve: 0.8205
#AUC Comparison Table
auc_table <- data.frame(
Model = c(
"Linear Regression",
"Random Forest Regression",
"Gradient Boosting Regression"
),
AUC = c(
as.numeric(auc_lm),
as.numeric(auc_rf),
as.numeric(auc_gbm)
)
)
auc_table <- auc_table %>%
mutate(AUC = round(AUC, 3)) %>%
arrange(desc(AUC))
kable(
auc_table,
caption = "AUC Comparison of Machine Learning Models",
align = "c"
)
| Model | AUC |
|---|---|
| Gradient Boosting Regression | 0.820 |
| Linear Regression | 0.817 |
| Random Forest Regression | 0.804 |
#Plot ROC Curves Together
plot(
roc_lm,
col = "blue",
lwd = 2,
main = "ROC Curve Comparison for High Blood Pressure Prediction"
)
plot(
roc_rf,
col = "red",
lwd = 2,
add = TRUE
)
plot(
roc_gbm,
col = "darkgreen",
lwd = 2,
add = TRUE
)
legend(
"bottomright",
legend = c(
paste("Linear Regression AUC =", round(auc_lm, 3)),
paste("Random Forest AUC =", round(auc_rf, 3)),
paste("Gradient Boosting AUC =", round(auc_gbm, 3))
),
col = c("blue", "red", "darkgreen"),
lwd = 2
)
# Convert ROC objects into data frames
roc_lm_df <- data.frame(
Specificity = roc_lm$specificities,
Sensitivity = roc_lm$sensitivities,
Model = "Linear Regression"
)
roc_rf_df <- data.frame(
Specificity = roc_rf$specificities,
Sensitivity = roc_rf$sensitivities,
Model = "Random Forest Regression"
)
roc_gbm_df <- data.frame(
Specificity = roc_gbm$specificities,
Sensitivity = roc_gbm$sensitivities,
Model = "Gradient Boosting Regression"
)
roc_all_df <- rbind(roc_lm_df, roc_rf_df, roc_gbm_df)
ggplot(roc_all_df, aes(x = 1 - Specificity, y = Sensitivity, color = Model)) +
geom_line(size = 1.0) +
geom_abline(
intercept = 0,
slope = 1,
linetype = "dashed",
color = "gray"
) +
labs(
title = "ROC Curve Comparison of Machine Learning Models",
subtitle = "High BP defined as average systolic blood pressure ≥ 130 mmHg",
x = "False Positive Rate (1 - Specificity)",
y = "True Positive Rate (Sensitivity)",
color = "Model"
) +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#========================================================= #FINAL MACHINE LEARNING MODEL COMPARISON TABLE #Regression Metrics + Classification Metrics #Outcome: avg_systolic #Classification threshold: High BP if systolic BP >= 130 #=========================================================
library(dplyr)
library(knitr)
# -------------------------------
# 1. Regression Metric Functions
# -------------------------------
rmse_fun <- function(actual, predicted) {
sqrt(mean((actual - predicted)^2, na.rm = TRUE))
}
mae_fun <- function(actual, predicted) {
mean(abs(actual - predicted), na.rm = TRUE)
}
r2_fun <- function(actual, predicted) {
cor(actual, predicted, use = "complete.obs")^2
}
# -------------------------------
# 2. Classification Metric Function
# -------------------------------
class_metrics_fun <- function(actual_bp, predicted_bp, threshold = 130) {
actual_class <- ifelse(actual_bp >= threshold, "High BP", "Normal BP")
predicted_class <- ifelse(predicted_bp >= threshold, "High BP", "Normal BP")
actual_class <- factor(actual_class, levels = c("High BP", "Normal BP"))
predicted_class <- factor(predicted_class, levels = c("High BP", "Normal BP"))
cm <- table(Predicted = predicted_class, Actual = actual_class)
TP <- cm["High BP", "High BP"]
TN <- cm["Normal BP", "Normal BP"]
FP <- cm["High BP", "Normal BP"]
FN <- cm["Normal BP", "High BP"]
accuracy <- (TP + TN) / sum(cm)
precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
specificity <- TN / (TN + FP)
f1_score <- 2 * ((precision * recall) / (precision + recall))
return(
c(
Accuracy = accuracy,
Precision = precision,
Recall = recall,
Specificity = specificity,
F1_Score = f1_score
)
)
}
# -------------------------------
# 3. Calculate Classification Metrics
# -------------------------------
lm_class_metrics <- class_metrics_fun(test_data$avg_systolic, lm_pred)
rf_class_metrics <- class_metrics_fun(test_data$avg_systolic, rf_pred)
gbm_class_metrics <- class_metrics_fun(test_data$avg_systolic, gbm_pred)
# -------------------------------
# 4. Create Final Comparison Table
# -------------------------------
model_comparison_final <- data.frame(
Model = c(
"Linear Regression",
"Random Forest Regression",
"Gradient Boosting Regression"
),
RMSE = c(
rmse_fun(test_data$avg_systolic, lm_pred),
rmse_fun(test_data$avg_systolic, rf_pred),
rmse_fun(test_data$avg_systolic, gbm_pred)
),
MAE = c(
mae_fun(test_data$avg_systolic, lm_pred),
mae_fun(test_data$avg_systolic, rf_pred),
mae_fun(test_data$avg_systolic, gbm_pred)
),
R_Squared = c(
r2_fun(test_data$avg_systolic, lm_pred),
r2_fun(test_data$avg_systolic, rf_pred),
r2_fun(test_data$avg_systolic, gbm_pred)
),
Accuracy = c(
lm_class_metrics["Accuracy"],
rf_class_metrics["Accuracy"],
gbm_class_metrics["Accuracy"]
),
Precision = c(
lm_class_metrics["Precision"],
rf_class_metrics["Precision"],
gbm_class_metrics["Precision"]
),
Recall = c(
lm_class_metrics["Recall"],
rf_class_metrics["Recall"],
gbm_class_metrics["Recall"]
),
Specificity = c(
lm_class_metrics["Specificity"],
rf_class_metrics["Specificity"],
gbm_class_metrics["Specificity"]
),
F1_Score = c(
lm_class_metrics["F1_Score"],
rf_class_metrics["F1_Score"],
gbm_class_metrics["F1_Score"]
)
)
# Round values for clean presentation
model_comparison_final <- model_comparison_final %>%
mutate(across(where(is.numeric), ~ round(.x, 3)))
# Print nice statistical table
kable(
model_comparison_final,
caption = "Machine Learning Model Comparison for Predicting Average Systolic Blood Pressure",
align = "c"
)
| Model | RMSE | MAE | R_Squared | Accuracy | Precision | Recall | Specificity | F1_Score |
|---|---|---|---|---|---|---|---|---|
| Linear Regression | 16.380 | 12.089 | 0.334 | 0.742 | 0.529 | 0.622 | 0.787 | 0.572 |
| Random Forest Regression | 16.501 | 12.235 | 0.326 | 0.738 | 0.522 | 0.617 | 0.784 | 0.566 |
| Gradient Boosting Regression | 16.208 | 11.981 | 0.348 | 0.745 | 0.531 | 0.681 | 0.769 | 0.597 |
Interpetation :-
To compare the machine learning pipelines, the same cleaned NHANES dataset was used for all models. The outcome variable was average systolic blood pressure, and the predictor variables included chocolate/cocoa intake, chocolate grams, age, gender, total calories, sugar, fat, protein, and carbohydrate intake. The dataset was split into 80% training and 20% testing data. Three regression models were developed: Linear Regression, Random Forest Regression, and Gradient Boosting Regression. Model performance was evaluated using RMSE, MAE, and R-squared. RMSE and MAE measured prediction error, while R-squared measured the proportion of variation in systolic blood pressure explained by the model.
The best-performing model was selected based on the lowest RMSE and MAE and the highest R-squared. Linear Regression was useful for interpretation and identifying statistically significant predictors. Random Forest Regression and Gradient Boosting Regression were useful for capturing non-linear relationships between dietary intake, demographic factors, and systolic blood pressure.
The machine learning pipelines were compared using identical training and testing datasets to ensure fairness. Linear Regression served as the baseline model, while Random Forest Regression and Gradient Boosting Regression were used to evaluate whether non-linear machine learning methods improved prediction of average systolic blood pressure. Performance was compared using RMSE, MAE, and R-squared. The model with the lowest RMSE and MAE and highest R-squared was considered the best predictive model. Variable importance from Random Forest and Gradient Boosting was also examined to determine whether chocolate/cocoa intake contributed meaningfully compared with stronger predictors such as age, gender, and total dietary intake.
COSMOS by saying that clinical trial evidence found cocoa extract did not significantly reduce total cardiovascular events, although secondary analyses suggested reduced CVD death; therefore, your NHANES ML analysis should be interpreted as an exploratory prediction study rather than causal evidence.