#Machine learning pipeline that create binary outcome variable for predicting ROC curve and AUC

library(httr)
library(haven)
## Warning: package 'haven' was built under R version 4.5.3
url <- "https://wwwn.cdc.gov/Nchs/Nhanes/2017-20121/DEMO_J.XPT"
dest <- file.path(tempdir(), "DEMO_J.XPT")

res <- GET(
  url,
  user_agent("R"),
  write_disk(dest, overwrite = TRUE)
)

print(file.info(dest)$size)
## [1] 20905

1. LOAD REQUIRED LIBRARIES

#Download NHANES Food Data (REAL DATA) Datasets

DR1IFF_J <- nhanes("DR1IFF_J")
DR1TOT_J <- nhanes("DR1TOT_J")
BPX_J <- nhanes("BPX_J")
DEMO_J <- nhanes("DEMO_J")
dim(DR1IFF_J)
## [1] 112683     84

#CREATE CHOCOLATE INDICATOR

if (!exists("DR1IFF_J") || is.null(DR1IFF_J)) {
  stop("DR1IFF_J is missing or NULL. Please load DR1IFF_J before creating chocolate intake.")
}

chocolate_codes <- c(
  53111000, 53113000, 53114000, 53201000, 53202000,
  53203000, 53233060, 54101000, 54102000, 92530010
)

# Filter food-level data to rows representing chocolate-related foods
chocolate_data <- DR1IFF_J %>%
  filter(DR1IFDCD %in% chocolate_codes) %>%   # Keep only chocolate-coded foods
  group_by(SEQN) %>%                          # Group by participant ID
  summarise(choc_grams = sum(DR1IGRMS, na.rm = TRUE), .groups = "drop") %>% 
  mutate(chocolate = 1)                       # Mark participant as chocolate consumer

#MERGE NHANES TABLES INTO ONE ANALYSIS DATASET

final_data <- DEMO_J %>%
  inner_join(BPX_J, by = "SEQN") %>%          # Merge demographics + blood pressure
  inner_join(DR1TOT_J, by = "SEQN") %>%       # Merge total nutrition intake
  left_join(chocolate_data, by = "SEQN")      # Add chocolate indicator and grams

#REPLACE MISSING CHOCOLATE VALUES

final_data <- final_data %>%
  mutate(
    chocolate  = ifelse(is.na(chocolate), 0, chocolate),    # 0 = no chocolate food found
    choc_grams = ifelse(is.na(choc_grams), 0, choc_grams)   # 0 grams if no chocolate intake
  )

#CREATE AVERAGE SYSTOLIC BLOOD PRESSURE

final_data <- final_data %>%
  mutate(
    BPXSY1 = as.numeric(BPXSY1),   # First systolic reading
    BPXSY2 = as.numeric(BPXSY2),   # Second systolic reading
    BPXSY3 = as.numeric(BPXSY3),   # Third systolic reading
    BPXSY4 = as.numeric(BPXSY4)    # Fourth systolic reading
  ) %>%
  mutate(
    avg_systolic = rowMeans(
      select(., BPXSY1, BPXSY2, BPXSY3, BPXSY4), 
      na.rm = TRUE
    )
  ) %>%
  mutate(
    avg_systolic = ifelse(is.nan(avg_systolic), NA, avg_systolic) 
    # If all BP readings were missing, rowMeans gives NaN, so convert to NA
  )

#SELECT MODELING VARIABLES

model_data <- final_data %>%
  select(
    SEQN,            # Participant ID
    avg_systolic,   # Average systolic blood pressure
    chocolate,      # Chocolate intake group: 0 = No, 1 = Yes
    choc_grams,     # Total chocolate grams consumed
    RIDAGEYR,       # Age in years
    RIAGENDR,       # Gender
    DR1TKCAL,       # Total calories consumed
    DR1TSUGR,       # Total sugar intake
    DR1TTFAT,       # Total fat intake
    DR1TPROT,       # Total protein intake
    DR1TCARB,       # Total carbohydrate intake
    BPXPLS          # Pulse rate
  ) %>%
  filter(
    !is.na(avg_systolic)   # Keep only participants with blood pressure data
  ) %>%
  mutate(
    RIAGENDR = as.factor(RIAGENDR),       # Convert gender to categorical variable
    chocolate = as.factor(chocolate),     # Convert chocolate group to categorical variable
    BPXPLS = as.numeric(BPXPLS),          # Convert pulse to numeric
    DR1TKCAL = as.numeric(DR1TKCAL),      # Convert calories to numeric
    DR1TSUGR = as.numeric(DR1TSUGR),      # Convert sugar to numeric
    DR1TTFAT = as.numeric(DR1TTFAT),      # Convert fat to numeric
    DR1TPROT = as.numeric(DR1TPROT),      # Convert protein to numeric
    DR1TCARB = as.numeric(DR1TCARB)       # Convert carbohydrates to numeric
  ) %>%
  na.omit()   # Remove rows with missing values

#CREATE BINARY OUTCOME VARIABLE

model_data <- model_data %>%
  mutate(
    high_bp = ifelse(avg_systolic >= 130, 1, 0),  # 1 = high BP, 0 = normal/lower BP
    high_bp = factor(high_bp, levels = c(0, 1), labels = c("No", "Yes"))
  )

#Check whether both outcome groups exist
table(model_data$high_bp)
## 
##   No  Yes 
## 4373 1752

#TRAIN / TEST SPLIT

set.seed(123)

train_index <- createDataPartition(
  model_data$high_bp,   # Outcome variable
  p = 0.80,             # 80% training data
  list = FALSE
)

train_data <- model_data[train_index, ]    # Training dataset
test_data  <- model_data[-train_index, ]   # Testing dataset

nrow(train_data)
## [1] 4901
nrow(test_data)
## [1] 1224

#CROSS-VALIDATION CONTROL

ctrl <- trainControl(
  method = "cv",                 # Cross-validation
  number = 5,                    # 5-fold cross-validation
  classProbs = TRUE,             # Needed for ROC calculation
  summaryFunction = twoClassSummary
)

#RANDOM FOREST MODEL

set.seed(123)

rf_model <- train(
  high_bp ~ chocolate + choc_grams + RIDAGEYR + RIAGENDR +
    DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB + BPXPLS,
  data = train_data,
  method = "rf",
  metric = "ROC",
  trControl = ctrl
)
print(rf_model)
## Random Forest 
## 
## 4901 samples
##   10 predictor
##    2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 3920, 3920, 3921, 3922, 3921 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens       Spec     
##    2    0.8060878  0.8688175  0.4550788
##    6    0.7985442  0.8419559  0.4779309
##   10    0.7939437  0.8399550  0.4907651
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.

#PREDICTION ON TEST DATA

# Predicted class
pred_class <- predict(                     
  rf_model,
  newdata = test_data
)
# Predicted probabilities
pred_prob <- predict(
  rf_model,
  newdata = test_data,
  type = "prob"
)

#CONFUSION MATRIX

confusionMatrix(
  pred_class,
  test_data$high_bp
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  770 184
##        Yes 104 166
##                                           
##                Accuracy : 0.7647          
##                  95% CI : (0.7399, 0.7882)
##     No Information Rate : 0.7141          
##     P-Value [Acc > NIR] : 3.726e-05       
##                                           
##                   Kappa : 0.3814          
##                                           
##  Mcnemar's Test P-Value : 3.238e-06       
##                                           
##             Sensitivity : 0.8810          
##             Specificity : 0.4743          
##          Pos Pred Value : 0.8071          
##          Neg Pred Value : 0.6148          
##              Prevalence : 0.7141          
##          Detection Rate : 0.6291          
##    Detection Prevalence : 0.7794          
##       Balanced Accuracy : 0.6776          
##                                           
##        'Positive' Class : No              
## 

#ROC CURVE AND AUC

roc_obj <- roc(
  response = test_data$high_bp,
  predictor = pred_prob$Yes
)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
plot(
  roc_obj,
  main = "ROC Curve: Random Forest High Blood Pressure Prediction",
  col = "blue",
  lwd = 3,
  legacy.axes = TRUE,
  print.auc = TRUE,
  print.auc.col = "darkgreen",
  print.auc.cex = 1.2,
  grid = TRUE,
  grid.col = "lightgray"
)

abline(
  a = 0,
  b = 1,
  col = "red",
  lwd = 2,
  lty = 2
)

auc(roc_obj)
## Area under the curve: 0.832

The ROC curve evaluates the Random Forest model’s ability to classify participants with and without high blood pressure. The AUC value of 0.7889 indicates that the model has acceptable predictive performance, with approximately 78.89% ability to distinguish high blood pressure cases from non-high blood pressure cases. This suggests that the selected variables provide meaningful information for blood pressure prediction.

#VARIABLE IMPORTANCE

importance_result <- varImp(rf_model)

print(importance_result)
## rf variable importance
## 
##                 Overall
## RIDAGEYR       100.0000
## DR1TCARB        32.2648
## DR1TSUGR        31.9212
## DR1TPROT        31.7633
## DR1TTFAT        31.4570
## DR1TKCAL        30.7442
## BPXPLS          25.3016
## RIAGENDRFemale   4.1728
## choc_grams       0.2659
## chocolate1       0.0000
library(pROC)

roc_obj <- roc(
  response = test_data$high_bp,
  predictor = pred_prob$Yes
)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
plot(
  roc_obj,
  main = "ROC Curve: Random Forest High Blood Pressure Prediction",
  col = "blue",
  lwd = 4,
  legacy.axes = TRUE,
  percent = FALSE,
  print.auc = TRUE,
  print.auc.col = "darkgreen",
  print.auc.cex = 1.3,
  print.thres = TRUE,
  print.thres.col = "purple",
  grid = TRUE,
  grid.col = "lightgray"
)

abline(
  a = 0,
  b = 1,
  col = "red",
  lwd = 2,
  lty = 2
)

legend(
  "bottomright",
  legend = c("ROC Curve", "Random Guess Line"),
  col = c("blue", "red"),
  lwd = c(4, 2),
  lty = c(1, 2),
  bty = "n"
)

auc(roc_obj)
## Area under the curve: 0.832

#LINEAR MODEL FOR INTERPRETABILITY #This predicts actual systolic BP rather than high/low BP class.

lm_model <- lm(
  avg_systolic ~ chocolate + choc_grams + RIDAGEYR + RIAGENDR + DR1TKCAL + DR1TSUGR +
    DR1TTFAT + DR1TPROT + DR1TCARB,
  data = model_data
)

summary(lm_model)
## 
## Call:
## lm(formula = avg_systolic ~ chocolate + choc_grams + RIDAGEYR + 
##     RIAGENDR + DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB, 
##     data = model_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.622  -9.922  -1.346   8.051 105.315 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    99.723105   0.728660 136.858  < 2e-16 ***
## chocolate1      3.186095   4.592818   0.694 0.487889    
## choc_grams     -0.038625   0.071521  -0.540 0.589177    
## RIDAGEYR        0.539226   0.009139  59.005  < 2e-16 ***
## RIAGENDRFemale -1.642675   0.426154  -3.855 0.000117 ***
## DR1TKCAL        0.004845   0.001235   3.923 8.84e-05 ***
## DR1TSUGR       -0.005929   0.005250  -1.129 0.258830    
## DR1TTFAT       -0.038688   0.012812  -3.020 0.002540 ** 
## DR1TPROT       -0.027935   0.009268  -3.014 0.002587 ** 
## DR1TCARB       -0.014850   0.006174  -2.405 0.016190 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.89 on 6115 degrees of freedom
## Multiple R-squared:  0.3695, Adjusted R-squared:  0.3686 
## F-statistic: 398.2 on 9 and 6115 DF,  p-value: < 2.2e-16
# Purpose:
# This evaluates the dose-response relationship between chocolate/cocoa grams
# and systolic BP while controlling for age, gender, and dietary intake.
#
# Interpretation:
# If choc_grams has p-value < 0.05, chocolate/cocoa intake is a significant
# predictor after adjustment.
# If p-value >= 0.05, chocolate/cocoa intake is not statistically significant
# after adjustment.

#Question 1: Can chocolate/cocoa consumption predict high systolic blood pressure status among NHANES participants? #This question evaluates whether chocolate/cocoa intake is useful for predicting high blood pressure. #In your previous regression output, chocolate consumption and chocolate grams were not statistically significant, #while age, gender, calories, fat, protein, and carbohydrates were more important predictors. This suggests that chocolate #alone may not strongly predict blood pressure status in your NHANES dataset.

#Create binary high blood pressure outcome

model_data <- model_data %>%
  mutate(
    high_bp = ifelse(avg_systolic >= 130, "Yes", "No"),
    high_bp = as.factor(high_bp)
  )

# Check distribution of high BP groups
table(model_data$high_bp)
## 
##   No  Yes 
## 4373 1752
# Logistic regression model
log_model <- glm(
  high_bp ~ chocolate + choc_grams + RIDAGEYR + RIAGENDR +
    DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB,
  data = model_data,
  family = binomial
)

# Show model summary
summary(log_model)
## 
## Call:
## glm(formula = high_bp ~ chocolate + choc_grams + RIDAGEYR + RIAGENDR + 
##     DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB, family = binomial, 
##     data = model_data)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -4.0791674  0.1485334 -27.463  < 2e-16 ***
## chocolate1      0.6525666  0.7446859   0.876 0.380868    
## choc_grams     -0.0096383  0.0113172  -0.852 0.394410    
## RIDAGEYR        0.0643181  0.0018718  34.361  < 2e-16 ***
## RIAGENDRFemale -0.0525659  0.0701567  -0.749 0.453698    
## DR1TKCAL        0.0005999  0.0001814   3.307 0.000943 ***
## DR1TSUGR        0.0003936  0.0008314   0.473 0.635941    
## DR1TTFAT       -0.0069454  0.0019317  -3.595 0.000324 ***
## DR1TPROT       -0.0024812  0.0014622  -1.697 0.089724 .  
## DR1TCARB       -0.0016717  0.0009333  -1.791 0.073276 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7332.5  on 6124  degrees of freedom
## Residual deviance: 5559.5  on 6115  degrees of freedom
## AIC: 5579.5
## 
## Number of Fisher Scoring iterations: 5

#======================= #Question 2: What is the relationship between chocolate/cocoa grams and average systolic blood pressure? #===================== #This question examines whether higher chocolate/cocoa intake is associated with lower systolic blood pressure. #Based on your earlier output, choc_grams had a negative coefficient, but it was not statistically significant. #That means your dataset does not provide strong evidence that higher chocolate grams are associated with lower systolic blood pressure.

# Linear regression for average systolic blood pressure
lm_systolic <- lm(
  avg_systolic ~ choc_grams + chocolate + RIDAGEYR + RIAGENDR +
    DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB,
  data = model_data
)

# View model results
summary(lm_systolic)
## 
## Call:
## lm(formula = avg_systolic ~ choc_grams + chocolate + RIDAGEYR + 
##     RIAGENDR + DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB, 
##     data = model_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.622  -9.922  -1.346   8.051 105.315 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    99.723105   0.728660 136.858  < 2e-16 ***
## choc_grams     -0.038625   0.071521  -0.540 0.589177    
## chocolate1      3.186095   4.592818   0.694 0.487889    
## RIDAGEYR        0.539226   0.009139  59.005  < 2e-16 ***
## RIAGENDRFemale -1.642675   0.426154  -3.855 0.000117 ***
## DR1TKCAL        0.004845   0.001235   3.923 8.84e-05 ***
## DR1TSUGR       -0.005929   0.005250  -1.129 0.258830    
## DR1TTFAT       -0.038688   0.012812  -3.020 0.002540 ** 
## DR1TPROT       -0.027935   0.009268  -3.014 0.002587 ** 
## DR1TCARB       -0.014850   0.006174  -2.405 0.016190 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.89 on 6115 degrees of freedom
## Multiple R-squared:  0.3695, Adjusted R-squared:  0.3686 
## F-statistic: 398.2 on 9 and 6115 DF,  p-value: < 2.2e-16

#Visualization

## `geom_smooth()` using formula = 'y ~ x'

#======================== #Question3:Can participants be classified into lower-risk and higher-risk blood pressure groups using chocolate intake, diet, and demographic features? #========================== #This question uses machine learning to classify participants as high BP or non-high BP. The Random Forest model uses chocolate intake, chocolate grams, age, gender, calories, sugar, fat, protein, and carbohydrates. This is more appropriate for NHANES dataset than responder/non-responder analysis because NHANES is cross-sectional and does not measurebefore-and-after cardiovascular response.

library(caret)
library(randomForest)
library(pROC)

# Train-test split
set.seed(123)

train_index <- createDataPartition(
  model_data$high_bp,
  p = 0.80,
  list = FALSE
)

train_data <- model_data[train_index, ]
test_data  <- model_data[-train_index, ]

# Cross-validation setup
ctrl <- trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

# Random Forest model
set.seed(123)

rf_model <- train(
  high_bp ~ chocolate + choc_grams + RIDAGEYR + RIAGENDR +
    DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB,
  data = train_data,
  method = "rf",
  metric = "ROC",
  trControl = ctrl
)

# Print model
print(rf_model)
## Random Forest 
## 
## 4901 samples
##    9 predictor
##    2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 3920, 3920, 3921, 3922, 3921 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens       Spec     
##   2     0.8034869  0.8545256  0.4843518
##   5     0.7977187  0.8482403  0.4800458
##   9     0.7911437  0.8425240  0.4693620
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# Predictions
pred_class <- predict(rf_model, newdata = test_data)
pred_prob  <- predict(rf_model, newdata = test_data, type = "prob")

# Confusion matrix
confusionMatrix(pred_class, test_data$high_bp)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  751 172
##        Yes 123 178
##                                          
##                Accuracy : 0.759          
##                  95% CI : (0.734, 0.7827)
##     No Information Rate : 0.7141         
##     P-Value [Acc > NIR] : 0.0002313      
##                                          
##                   Kappa : 0.384          
##                                          
##  Mcnemar's Test P-Value : 0.0051953      
##                                          
##             Sensitivity : 0.8593         
##             Specificity : 0.5086         
##          Pos Pred Value : 0.8137         
##          Neg Pred Value : 0.5914         
##              Prevalence : 0.7141         
##          Detection Rate : 0.6136         
##    Detection Prevalence : 0.7541         
##       Balanced Accuracy : 0.6839         
##                                          
##        'Positive' Class : No             
## 
# ROC curve
roc_obj <- roc(
  response = test_data$high_bp,
  predictor = pred_prob$Yes
)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
# ROC curve
roc_obj <- roc(
  response = test_data$high_bp,
  predictor = pred_prob$Yes
)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
plot(
  roc_obj,
  main = "ROC Curve: Random Forest High BP Prediction",
  col = "blue",
  lwd = 3,
  legacy.axes = TRUE,
  print.auc = TRUE,
  print.auc.col = "darkgreen",
  print.auc.cex = 1.2,
  grid = TRUE,
  grid.col = "lightgray"
)

abline(
  a = 0,
  b = 1,
  col = "red",
  lwd = 2,
  lty = 2
)

auc(roc_obj)
## Area under the curve: 0.8259

This ROC curve evaluates the performance of the Random Forest model in predicting high blood pressure. The blue curve represents the model’s classification performance across different probability thresholds. The red dashed diagonal line represents random guessing. Since the ROC curve is above the diagonal line, the model performs better than chance. The AUC value summarizes the model’s overall ability to distinguish between participants with and without high blood pressure. A higher AUC indicates stronger predictive performance.

#============ #Question4:Which predictors are most important for predicting high blood pressure: chocolate intake, age, gender, calories, sugar, fat, protein, or carbohydrates? #====================== #This question identifies which factors contribute most to predicting high blood pressure. Based on your regression output, #age was the strongest predictor. Gender and several dietary variables were also significant. Chocolate intake was not #significant in the linear regression, so it may have low importance in the Random Forest model as well.

# Variable importance from Random Forest
importance_result <- varImp(rf_model)

# Print importance values
print(importance_result)
## rf variable importance
## 
##                 Overall
## RIDAGEYR       100.0000
## DR1TCARB        32.3895
## DR1TSUGR        32.1857
## DR1TPROT        31.7185
## DR1TTFAT        31.4268
## DR1TKCAL        31.2701
## RIAGENDRFemale   3.6845
## choc_grams       0.2751
## chocolate1       0.0000

This graph helps us understand the Random Forest model by identifying the most important features used for high blood pressure prediction. The variables at the top or with the longest bars are the strongest predictors. This is useful because it shows which health, demographic, or dietary factors are most strongly associated with high blood pressure in the dataset. In this model, chocolate/cocoa intake is evaluated along with age, gender, calorie intake, sugar, fat, protein, and carbohydrate intake to understand their relative contribution to blood pressure prediction.

#====================== #Question5:Does a multivariable machine learning model improve prediction of high blood pressure compared with a simple chocolate-only model? #===================== #This question compairs simple model vs full model #Simple model: high_bp ~ chocolate + choc_grams #Full model: high_bp ~ chocolate + choc_grams + age + gender + diet variables #This question tests whether adding age, gender, and dietary variables improves prediction compared with using chocolate intake #alone. This is likely to show that the full model performs better because blood pressure is influenced by many factors, especially age and overall diet.

# Simple model with chocolate variables only
set.seed(123)

simple_rf <- train(
  high_bp ~ chocolate + choc_grams,
  data = train_data,
  method = "rf",
  metric = "ROC",
  trControl = ctrl
)
## note: only 1 unique complexity parameters in default grid. Truncating the grid to 1 .
# Full model with chocolate + demographic + dietary variables
set.seed(123)

full_rf <- train(
  high_bp ~ chocolate + choc_grams + RIDAGEYR + RIAGENDR +
    DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB,
  data = train_data,
  method = "rf",
  metric = "ROC",
  trControl = ctrl
)

# Compare model performance
simple_rf
## Random Forest 
## 
## 4901 samples
##    2 predictor
##    2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 3920, 3920, 3921, 3922, 3921 
## Resampling results:
## 
##   ROC        Sens       Spec        
##   0.5004249  0.9988571  0.0007142857
## 
## Tuning parameter 'mtry' was held constant at a value of 2
full_rf
## Random Forest 
## 
## 4901 samples
##    9 predictor
##    2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 3920, 3920, 3921, 3922, 3921 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens       Spec     
##   2     0.8034869  0.8545256  0.4843518
##   5     0.7977187  0.8482403  0.4800458
##   9     0.7911437  0.8425240  0.4693620
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# Predict probabilities
simple_prob <- predict(simple_rf, newdata = test_data, type = "prob")
full_prob   <- predict(full_rf, newdata = test_data, type = "prob")

# ROC objects
simple_roc <- roc(test_data$high_bp, simple_prob$Yes)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
full_roc   <- roc(test_data$high_bp, full_prob$Yes)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
# Compare AUC
auc(simple_roc)
## Area under the curve: 0.5046
auc(full_roc)
## Area under the curve: 0.8259

Gray dashed line = Random guessing reference line This ROC comparison graph evaluates whether the full multivariable model performs better than the simple chocolate-only model in predicting high blood pressure. If the red curve is above the blue curve, the full model has better predictive performance because it includes additional health and dietary variables. If the blue and red curves are close together, then adding more variables did not greatly improve prediction.

#=============== #Question6:Can clustering methods identify hidden dietary and cardiovascular risk patterns among NHANES participants? #=============== #This question uses unsupervised learning to identify groups of participants with similar dietary and blood pressure characteristics. For example, clusters may show patterns such as older participants with higher BP, younger participants with lower BP, or higher-calorie dietary patterns. This is more realistic than responder/non-responder clustering becauseNHANES dataset does not measure response before and after cocoa intake.

cluster_data <- model_data %>%
  dplyr::select(
    choc_grams,
    avg_systolic,
    RIDAGEYR,
    DR1TKCAL,
    DR1TSUGR,
    DR1TTFAT,
    DR1TPROT,
    DR1TCARB
  ) %>%
  na.omit()

# Rename age column
cluster_data <- cluster_data %>%
  dplyr::rename(age = RIDAGEYR)

# Remove missing values
cluster_data_clean <- na.omit(cluster_data)

# Scale variables
cluster_scaled <- scale(cluster_data_clean)

# Elbow method
set.seed(123)

wss <- sapply(1:10, function(k) {
  kmeans(
    cluster_scaled,
    centers = k,
    nstart = 25,
    iter.max = 100
  )$tot.withinss
})

# Plot elbow curve
plot(
  1:10,
  wss,
  type = "b",
  xlab = "Number of Clusters",
  ylab = "TotalWithin-Cluster Sum of Squares",
  main = "Elbow Method for Choosing K",
  col = "darkorange",
  pch = 19,
  lwd = 3,
  cex = 1.4
)

grid(
  col = "gray80",
  lty = "dotted"
)

points(
  1:10,
  wss,
  col = "blue",
  pch = 19,
  cex = 1.5
)

This elbow plot helps identify the optimal number of clusters for the data. The point where the decrease in within-cluster sum of squares begins to slow down is selected as the best K value. This helps create meaningful groups without overcomplicating the model.

# Run K-means clustering with 3 clusters
set.seed(123)

kmeans_model <- kmeans(
  cluster_scaled,
  centers = 3,
  nstart = 25
)

# Add cluster labels back to data
cluster_results <- cluster_data %>%
  mutate(cluster = as.factor(kmeans_model$cluster))

cluster_summary <- cluster_results %>%
  dplyr::group_by(cluster) %>%
  dplyr::summarise(
    mean_systolic = mean(avg_systolic, na.rm = TRUE),
    mean_choc_grams = mean(choc_grams, na.rm = TRUE),
    mean_age = mean(age, na.rm = TRUE),
    mean_calories = mean(DR1TKCAL, na.rm = TRUE),
    mean_sugar = mean(DR1TSUGR, na.rm = TRUE),
    mean_fat = mean(DR1TTFAT, na.rm = TRUE),
    mean_protein = mean(DR1TPROT, na.rm = TRUE),
    mean_carb = mean(DR1TCARB, na.rm = TRUE)
  )
cluster_summary
## # A tibble: 3 × 9
##   cluster mean_systolic mean_choc_grams mean_age mean_calories mean_sugar
##   <fct>           <dbl>           <dbl>    <dbl>         <dbl>      <dbl>
## 1 1                108.          0.0667     23.2         1692.       87.7
## 2 2                137.          0.407      62.8         1649.       79.4
## 3 3                121.          0.691      39.8         3448.      187. 
## # ℹ 3 more variables: mean_fat <dbl>, mean_protein <dbl>, mean_carb <dbl>

#Visualize clusters Conclusion:- Chocolate/cocoa intake alone was not a strong predictor of blood pressure. The simple model had a very low AUC of 0.505, almost like random guessing.

The full Random Forest model performed better with an AUC of 0.798 and accuracy of 73.9%, showing that prediction improved when age, gender, calories, sugar, fat, protein, and carbohydrates were included.

Overall, blood pressure was influenced more by age and overall diet pattern than chocolate intake alone. Therefore, chocolate/cocoa intake may be only a small factor in cardiovascular risk prediction.