#Machine learning pipeline that create binary outcome variable for predicting ROC curve and AUC
library(httr)
library(haven)
## Warning: package 'haven' was built under R version 4.5.3
url <- "https://wwwn.cdc.gov/Nchs/Nhanes/2017-20121/DEMO_J.XPT"
dest <- file.path(tempdir(), "DEMO_J.XPT")
res <- GET(
url,
user_agent("R"),
write_disk(dest, overwrite = TRUE)
)
print(file.info(dest)$size)
## [1] 20905
#Download NHANES Food Data (REAL DATA) Datasets
DR1IFF_J <- nhanes("DR1IFF_J")
DR1TOT_J <- nhanes("DR1TOT_J")
BPX_J <- nhanes("BPX_J")
DEMO_J <- nhanes("DEMO_J")
dim(DR1IFF_J)
## [1] 112683 84
#CREATE CHOCOLATE INDICATOR
if (!exists("DR1IFF_J") || is.null(DR1IFF_J)) {
stop("DR1IFF_J is missing or NULL. Please load DR1IFF_J before creating chocolate intake.")
}
chocolate_codes <- c(
53111000, 53113000, 53114000, 53201000, 53202000,
53203000, 53233060, 54101000, 54102000, 92530010
)
# Filter food-level data to rows representing chocolate-related foods
chocolate_data <- DR1IFF_J %>%
filter(DR1IFDCD %in% chocolate_codes) %>% # Keep only chocolate-coded foods
group_by(SEQN) %>% # Group by participant ID
summarise(choc_grams = sum(DR1IGRMS, na.rm = TRUE), .groups = "drop") %>%
mutate(chocolate = 1) # Mark participant as chocolate consumer
#MERGE NHANES TABLES INTO ONE ANALYSIS DATASET
final_data <- DEMO_J %>%
inner_join(BPX_J, by = "SEQN") %>% # Merge demographics + blood pressure
inner_join(DR1TOT_J, by = "SEQN") %>% # Merge total nutrition intake
left_join(chocolate_data, by = "SEQN") # Add chocolate indicator and grams
#REPLACE MISSING CHOCOLATE VALUES
final_data <- final_data %>%
mutate(
chocolate = ifelse(is.na(chocolate), 0, chocolate), # 0 = no chocolate food found
choc_grams = ifelse(is.na(choc_grams), 0, choc_grams) # 0 grams if no chocolate intake
)
#CREATE AVERAGE SYSTOLIC BLOOD PRESSURE
final_data <- final_data %>%
mutate(
BPXSY1 = as.numeric(BPXSY1), # First systolic reading
BPXSY2 = as.numeric(BPXSY2), # Second systolic reading
BPXSY3 = as.numeric(BPXSY3), # Third systolic reading
BPXSY4 = as.numeric(BPXSY4) # Fourth systolic reading
) %>%
mutate(
avg_systolic = rowMeans(
select(., BPXSY1, BPXSY2, BPXSY3, BPXSY4),
na.rm = TRUE
)
) %>%
mutate(
avg_systolic = ifelse(is.nan(avg_systolic), NA, avg_systolic)
# If all BP readings were missing, rowMeans gives NaN, so convert to NA
)
#SELECT MODELING VARIABLES
model_data <- final_data %>%
select(
SEQN, # Participant ID
avg_systolic, # Average systolic blood pressure
chocolate, # Chocolate intake group: 0 = No, 1 = Yes
choc_grams, # Total chocolate grams consumed
RIDAGEYR, # Age in years
RIAGENDR, # Gender
DR1TKCAL, # Total calories consumed
DR1TSUGR, # Total sugar intake
DR1TTFAT, # Total fat intake
DR1TPROT, # Total protein intake
DR1TCARB, # Total carbohydrate intake
BPXPLS # Pulse rate
) %>%
filter(
!is.na(avg_systolic) # Keep only participants with blood pressure data
) %>%
mutate(
RIAGENDR = as.factor(RIAGENDR), # Convert gender to categorical variable
chocolate = as.factor(chocolate), # Convert chocolate group to categorical variable
BPXPLS = as.numeric(BPXPLS), # Convert pulse to numeric
DR1TKCAL = as.numeric(DR1TKCAL), # Convert calories to numeric
DR1TSUGR = as.numeric(DR1TSUGR), # Convert sugar to numeric
DR1TTFAT = as.numeric(DR1TTFAT), # Convert fat to numeric
DR1TPROT = as.numeric(DR1TPROT), # Convert protein to numeric
DR1TCARB = as.numeric(DR1TCARB) # Convert carbohydrates to numeric
) %>%
na.omit() # Remove rows with missing values
#CREATE BINARY OUTCOME VARIABLE
model_data <- model_data %>%
mutate(
high_bp = ifelse(avg_systolic >= 130, 1, 0), # 1 = high BP, 0 = normal/lower BP
high_bp = factor(high_bp, levels = c(0, 1), labels = c("No", "Yes"))
)
#Check whether both outcome groups exist
table(model_data$high_bp)
##
## No Yes
## 4373 1752
#TRAIN / TEST SPLIT
set.seed(123)
train_index <- createDataPartition(
model_data$high_bp, # Outcome variable
p = 0.80, # 80% training data
list = FALSE
)
train_data <- model_data[train_index, ] # Training dataset
test_data <- model_data[-train_index, ] # Testing dataset
nrow(train_data)
## [1] 4901
nrow(test_data)
## [1] 1224
#CROSS-VALIDATION CONTROL
ctrl <- trainControl(
method = "cv", # Cross-validation
number = 5, # 5-fold cross-validation
classProbs = TRUE, # Needed for ROC calculation
summaryFunction = twoClassSummary
)
#RANDOM FOREST MODEL
set.seed(123)
rf_model <- train(
high_bp ~ chocolate + choc_grams + RIDAGEYR + RIAGENDR +
DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB + BPXPLS,
data = train_data,
method = "rf",
metric = "ROC",
trControl = ctrl
)
print(rf_model)
## Random Forest
##
## 4901 samples
## 10 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 3920, 3920, 3921, 3922, 3921
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.8060878 0.8688175 0.4550788
## 6 0.7985442 0.8419559 0.4779309
## 10 0.7939437 0.8399550 0.4907651
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
#PREDICTION ON TEST DATA
# Predicted class
pred_class <- predict(
rf_model,
newdata = test_data
)
# Predicted probabilities
pred_prob <- predict(
rf_model,
newdata = test_data,
type = "prob"
)
#CONFUSION MATRIX
confusionMatrix(
pred_class,
test_data$high_bp
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 770 184
## Yes 104 166
##
## Accuracy : 0.7647
## 95% CI : (0.7399, 0.7882)
## No Information Rate : 0.7141
## P-Value [Acc > NIR] : 3.726e-05
##
## Kappa : 0.3814
##
## Mcnemar's Test P-Value : 3.238e-06
##
## Sensitivity : 0.8810
## Specificity : 0.4743
## Pos Pred Value : 0.8071
## Neg Pred Value : 0.6148
## Prevalence : 0.7141
## Detection Rate : 0.6291
## Detection Prevalence : 0.7794
## Balanced Accuracy : 0.6776
##
## 'Positive' Class : No
##
#ROC CURVE AND AUC
roc_obj <- roc(
response = test_data$high_bp,
predictor = pred_prob$Yes
)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
plot(
roc_obj,
main = "ROC Curve: Random Forest High Blood Pressure Prediction",
col = "blue",
lwd = 3,
legacy.axes = TRUE,
print.auc = TRUE,
print.auc.col = "darkgreen",
print.auc.cex = 1.2,
grid = TRUE,
grid.col = "lightgray"
)
abline(
a = 0,
b = 1,
col = "red",
lwd = 2,
lty = 2
)
auc(roc_obj)
## Area under the curve: 0.832
The ROC curve evaluates the Random Forest model’s ability to classify participants with and without high blood pressure. The AUC value of 0.7889 indicates that the model has acceptable predictive performance, with approximately 78.89% ability to distinguish high blood pressure cases from non-high blood pressure cases. This suggests that the selected variables provide meaningful information for blood pressure prediction.
#VARIABLE IMPORTANCE
importance_result <- varImp(rf_model)
print(importance_result)
## rf variable importance
##
## Overall
## RIDAGEYR 100.0000
## DR1TCARB 32.2648
## DR1TSUGR 31.9212
## DR1TPROT 31.7633
## DR1TTFAT 31.4570
## DR1TKCAL 30.7442
## BPXPLS 25.3016
## RIAGENDRFemale 4.1728
## choc_grams 0.2659
## chocolate1 0.0000
library(pROC)
roc_obj <- roc(
response = test_data$high_bp,
predictor = pred_prob$Yes
)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
plot(
roc_obj,
main = "ROC Curve: Random Forest High Blood Pressure Prediction",
col = "blue",
lwd = 4,
legacy.axes = TRUE,
percent = FALSE,
print.auc = TRUE,
print.auc.col = "darkgreen",
print.auc.cex = 1.3,
print.thres = TRUE,
print.thres.col = "purple",
grid = TRUE,
grid.col = "lightgray"
)
abline(
a = 0,
b = 1,
col = "red",
lwd = 2,
lty = 2
)
legend(
"bottomright",
legend = c("ROC Curve", "Random Guess Line"),
col = c("blue", "red"),
lwd = c(4, 2),
lty = c(1, 2),
bty = "n"
)
auc(roc_obj)
## Area under the curve: 0.832
#LINEAR MODEL FOR INTERPRETABILITY #This predicts actual systolic BP rather than high/low BP class.
lm_model <- lm(
avg_systolic ~ chocolate + choc_grams + RIDAGEYR + RIAGENDR + DR1TKCAL + DR1TSUGR +
DR1TTFAT + DR1TPROT + DR1TCARB,
data = model_data
)
summary(lm_model)
##
## Call:
## lm(formula = avg_systolic ~ chocolate + choc_grams + RIDAGEYR +
## RIAGENDR + DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB,
## data = model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.622 -9.922 -1.346 8.051 105.315
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 99.723105 0.728660 136.858 < 2e-16 ***
## chocolate1 3.186095 4.592818 0.694 0.487889
## choc_grams -0.038625 0.071521 -0.540 0.589177
## RIDAGEYR 0.539226 0.009139 59.005 < 2e-16 ***
## RIAGENDRFemale -1.642675 0.426154 -3.855 0.000117 ***
## DR1TKCAL 0.004845 0.001235 3.923 8.84e-05 ***
## DR1TSUGR -0.005929 0.005250 -1.129 0.258830
## DR1TTFAT -0.038688 0.012812 -3.020 0.002540 **
## DR1TPROT -0.027935 0.009268 -3.014 0.002587 **
## DR1TCARB -0.014850 0.006174 -2.405 0.016190 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.89 on 6115 degrees of freedom
## Multiple R-squared: 0.3695, Adjusted R-squared: 0.3686
## F-statistic: 398.2 on 9 and 6115 DF, p-value: < 2.2e-16
# Purpose:
# This evaluates the dose-response relationship between chocolate/cocoa grams
# and systolic BP while controlling for age, gender, and dietary intake.
#
# Interpretation:
# If choc_grams has p-value < 0.05, chocolate/cocoa intake is a significant
# predictor after adjustment.
# If p-value >= 0.05, chocolate/cocoa intake is not statistically significant
# after adjustment.
#Question 1: Can chocolate/cocoa consumption predict high systolic blood pressure status among NHANES participants? #This question evaluates whether chocolate/cocoa intake is useful for predicting high blood pressure. #In your previous regression output, chocolate consumption and chocolate grams were not statistically significant, #while age, gender, calories, fat, protein, and carbohydrates were more important predictors. This suggests that chocolate #alone may not strongly predict blood pressure status in your NHANES dataset.
#Create binary high blood pressure outcome
model_data <- model_data %>%
mutate(
high_bp = ifelse(avg_systolic >= 130, "Yes", "No"),
high_bp = as.factor(high_bp)
)
# Check distribution of high BP groups
table(model_data$high_bp)
##
## No Yes
## 4373 1752
# Logistic regression model
log_model <- glm(
high_bp ~ chocolate + choc_grams + RIDAGEYR + RIAGENDR +
DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB,
data = model_data,
family = binomial
)
# Show model summary
summary(log_model)
##
## Call:
## glm(formula = high_bp ~ chocolate + choc_grams + RIDAGEYR + RIAGENDR +
## DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB, family = binomial,
## data = model_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.0791674 0.1485334 -27.463 < 2e-16 ***
## chocolate1 0.6525666 0.7446859 0.876 0.380868
## choc_grams -0.0096383 0.0113172 -0.852 0.394410
## RIDAGEYR 0.0643181 0.0018718 34.361 < 2e-16 ***
## RIAGENDRFemale -0.0525659 0.0701567 -0.749 0.453698
## DR1TKCAL 0.0005999 0.0001814 3.307 0.000943 ***
## DR1TSUGR 0.0003936 0.0008314 0.473 0.635941
## DR1TTFAT -0.0069454 0.0019317 -3.595 0.000324 ***
## DR1TPROT -0.0024812 0.0014622 -1.697 0.089724 .
## DR1TCARB -0.0016717 0.0009333 -1.791 0.073276 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7332.5 on 6124 degrees of freedom
## Residual deviance: 5559.5 on 6115 degrees of freedom
## AIC: 5579.5
##
## Number of Fisher Scoring iterations: 5
#======================= #Question 2: What is the relationship between chocolate/cocoa grams and average systolic blood pressure? #===================== #This question examines whether higher chocolate/cocoa intake is associated with lower systolic blood pressure. #Based on your earlier output, choc_grams had a negative coefficient, but it was not statistically significant. #That means your dataset does not provide strong evidence that higher chocolate grams are associated with lower systolic blood pressure.
# Linear regression for average systolic blood pressure
lm_systolic <- lm(
avg_systolic ~ choc_grams + chocolate + RIDAGEYR + RIAGENDR +
DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB,
data = model_data
)
# View model results
summary(lm_systolic)
##
## Call:
## lm(formula = avg_systolic ~ choc_grams + chocolate + RIDAGEYR +
## RIAGENDR + DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB,
## data = model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.622 -9.922 -1.346 8.051 105.315
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 99.723105 0.728660 136.858 < 2e-16 ***
## choc_grams -0.038625 0.071521 -0.540 0.589177
## chocolate1 3.186095 4.592818 0.694 0.487889
## RIDAGEYR 0.539226 0.009139 59.005 < 2e-16 ***
## RIAGENDRFemale -1.642675 0.426154 -3.855 0.000117 ***
## DR1TKCAL 0.004845 0.001235 3.923 8.84e-05 ***
## DR1TSUGR -0.005929 0.005250 -1.129 0.258830
## DR1TTFAT -0.038688 0.012812 -3.020 0.002540 **
## DR1TPROT -0.027935 0.009268 -3.014 0.002587 **
## DR1TCARB -0.014850 0.006174 -2.405 0.016190 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.89 on 6115 degrees of freedom
## Multiple R-squared: 0.3695, Adjusted R-squared: 0.3686
## F-statistic: 398.2 on 9 and 6115 DF, p-value: < 2.2e-16
#Visualization
## `geom_smooth()` using formula = 'y ~ x'
#======================== #Question3:Can participants be classified into lower-risk and higher-risk blood pressure groups using chocolate intake, diet, and demographic features? #========================== #This question uses machine learning to classify participants as high BP or non-high BP. The Random Forest model uses chocolate intake, chocolate grams, age, gender, calories, sugar, fat, protein, and carbohydrates. This is more appropriate for NHANES dataset than responder/non-responder analysis because NHANES is cross-sectional and does not measurebefore-and-after cardiovascular response.
library(caret)
library(randomForest)
library(pROC)
# Train-test split
set.seed(123)
train_index <- createDataPartition(
model_data$high_bp,
p = 0.80,
list = FALSE
)
train_data <- model_data[train_index, ]
test_data <- model_data[-train_index, ]
# Cross-validation setup
ctrl <- trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary
)
# Random Forest model
set.seed(123)
rf_model <- train(
high_bp ~ chocolate + choc_grams + RIDAGEYR + RIAGENDR +
DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB,
data = train_data,
method = "rf",
metric = "ROC",
trControl = ctrl
)
# Print model
print(rf_model)
## Random Forest
##
## 4901 samples
## 9 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 3920, 3920, 3921, 3922, 3921
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.8034869 0.8545256 0.4843518
## 5 0.7977187 0.8482403 0.4800458
## 9 0.7911437 0.8425240 0.4693620
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# Predictions
pred_class <- predict(rf_model, newdata = test_data)
pred_prob <- predict(rf_model, newdata = test_data, type = "prob")
# Confusion matrix
confusionMatrix(pred_class, test_data$high_bp)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 751 172
## Yes 123 178
##
## Accuracy : 0.759
## 95% CI : (0.734, 0.7827)
## No Information Rate : 0.7141
## P-Value [Acc > NIR] : 0.0002313
##
## Kappa : 0.384
##
## Mcnemar's Test P-Value : 0.0051953
##
## Sensitivity : 0.8593
## Specificity : 0.5086
## Pos Pred Value : 0.8137
## Neg Pred Value : 0.5914
## Prevalence : 0.7141
## Detection Rate : 0.6136
## Detection Prevalence : 0.7541
## Balanced Accuracy : 0.6839
##
## 'Positive' Class : No
##
# ROC curve
roc_obj <- roc(
response = test_data$high_bp,
predictor = pred_prob$Yes
)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
# ROC curve
roc_obj <- roc(
response = test_data$high_bp,
predictor = pred_prob$Yes
)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
plot(
roc_obj,
main = "ROC Curve: Random Forest High BP Prediction",
col = "blue",
lwd = 3,
legacy.axes = TRUE,
print.auc = TRUE,
print.auc.col = "darkgreen",
print.auc.cex = 1.2,
grid = TRUE,
grid.col = "lightgray"
)
abline(
a = 0,
b = 1,
col = "red",
lwd = 2,
lty = 2
)
auc(roc_obj)
## Area under the curve: 0.8259
This ROC curve evaluates the performance of the Random Forest model in predicting high blood pressure. The blue curve represents the model’s classification performance across different probability thresholds. The red dashed diagonal line represents random guessing. Since the ROC curve is above the diagonal line, the model performs better than chance. The AUC value summarizes the model’s overall ability to distinguish between participants with and without high blood pressure. A higher AUC indicates stronger predictive performance.
#============ #Question4:Which predictors are most important for predicting high blood pressure: chocolate intake, age, gender, calories, sugar, fat, protein, or carbohydrates? #====================== #This question identifies which factors contribute most to predicting high blood pressure. Based on your regression output, #age was the strongest predictor. Gender and several dietary variables were also significant. Chocolate intake was not #significant in the linear regression, so it may have low importance in the Random Forest model as well.
# Variable importance from Random Forest
importance_result <- varImp(rf_model)
# Print importance values
print(importance_result)
## rf variable importance
##
## Overall
## RIDAGEYR 100.0000
## DR1TCARB 32.3895
## DR1TSUGR 32.1857
## DR1TPROT 31.7185
## DR1TTFAT 31.4268
## DR1TKCAL 31.2701
## RIAGENDRFemale 3.6845
## choc_grams 0.2751
## chocolate1 0.0000
This graph helps us understand the Random Forest model by identifying
the most important features used for high blood pressure prediction. The
variables at the top or with the longest bars are the strongest
predictors. This is useful because it shows which health, demographic,
or dietary factors are most strongly associated with high blood pressure
in the dataset. In this model, chocolate/cocoa intake is evaluated along
with age, gender, calorie intake, sugar, fat, protein, and carbohydrate
intake to understand their relative contribution to blood pressure
prediction.
#====================== #Question5:Does a multivariable machine learning model improve prediction of high blood pressure compared with a simple chocolate-only model? #===================== #This question compairs simple model vs full model #Simple model: high_bp ~ chocolate + choc_grams #Full model: high_bp ~ chocolate + choc_grams + age + gender + diet variables #This question tests whether adding age, gender, and dietary variables improves prediction compared with using chocolate intake #alone. This is likely to show that the full model performs better because blood pressure is influenced by many factors, especially age and overall diet.
# Simple model with chocolate variables only
set.seed(123)
simple_rf <- train(
high_bp ~ chocolate + choc_grams,
data = train_data,
method = "rf",
metric = "ROC",
trControl = ctrl
)
## note: only 1 unique complexity parameters in default grid. Truncating the grid to 1 .
# Full model with chocolate + demographic + dietary variables
set.seed(123)
full_rf <- train(
high_bp ~ chocolate + choc_grams + RIDAGEYR + RIAGENDR +
DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB,
data = train_data,
method = "rf",
metric = "ROC",
trControl = ctrl
)
# Compare model performance
simple_rf
## Random Forest
##
## 4901 samples
## 2 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 3920, 3920, 3921, 3922, 3921
## Resampling results:
##
## ROC Sens Spec
## 0.5004249 0.9988571 0.0007142857
##
## Tuning parameter 'mtry' was held constant at a value of 2
full_rf
## Random Forest
##
## 4901 samples
## 9 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 3920, 3920, 3921, 3922, 3921
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.8034869 0.8545256 0.4843518
## 5 0.7977187 0.8482403 0.4800458
## 9 0.7911437 0.8425240 0.4693620
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# Predict probabilities
simple_prob <- predict(simple_rf, newdata = test_data, type = "prob")
full_prob <- predict(full_rf, newdata = test_data, type = "prob")
# ROC objects
simple_roc <- roc(test_data$high_bp, simple_prob$Yes)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
full_roc <- roc(test_data$high_bp, full_prob$Yes)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
# Compare AUC
auc(simple_roc)
## Area under the curve: 0.5046
auc(full_roc)
## Area under the curve: 0.8259
Gray dashed line = Random guessing reference line This ROC comparison
graph evaluates whether the full multivariable model performs better
than the simple chocolate-only model in predicting high blood pressure.
If the red curve is above the blue curve, the full model has better
predictive performance because it includes additional health and dietary
variables. If the blue and red curves are close together, then adding
more variables did not greatly improve prediction.
#=============== #Question6:Can clustering methods identify hidden dietary and cardiovascular risk patterns among NHANES participants? #=============== #This question uses unsupervised learning to identify groups of participants with similar dietary and blood pressure characteristics. For example, clusters may show patterns such as older participants with higher BP, younger participants with lower BP, or higher-calorie dietary patterns. This is more realistic than responder/non-responder clustering becauseNHANES dataset does not measure response before and after cocoa intake.
cluster_data <- model_data %>%
dplyr::select(
choc_grams,
avg_systolic,
RIDAGEYR,
DR1TKCAL,
DR1TSUGR,
DR1TTFAT,
DR1TPROT,
DR1TCARB
) %>%
na.omit()
# Rename age column
cluster_data <- cluster_data %>%
dplyr::rename(age = RIDAGEYR)
# Remove missing values
cluster_data_clean <- na.omit(cluster_data)
# Scale variables
cluster_scaled <- scale(cluster_data_clean)
# Elbow method
set.seed(123)
wss <- sapply(1:10, function(k) {
kmeans(
cluster_scaled,
centers = k,
nstart = 25,
iter.max = 100
)$tot.withinss
})
# Plot elbow curve
plot(
1:10,
wss,
type = "b",
xlab = "Number of Clusters",
ylab = "TotalWithin-Cluster Sum of Squares",
main = "Elbow Method for Choosing K",
col = "darkorange",
pch = 19,
lwd = 3,
cex = 1.4
)
grid(
col = "gray80",
lty = "dotted"
)
points(
1:10,
wss,
col = "blue",
pch = 19,
cex = 1.5
)
This elbow plot helps identify the optimal number of clusters for the data. The point where the decrease in within-cluster sum of squares begins to slow down is selected as the best K value. This helps create meaningful groups without overcomplicating the model.
# Run K-means clustering with 3 clusters
set.seed(123)
kmeans_model <- kmeans(
cluster_scaled,
centers = 3,
nstart = 25
)
# Add cluster labels back to data
cluster_results <- cluster_data %>%
mutate(cluster = as.factor(kmeans_model$cluster))
cluster_summary <- cluster_results %>%
dplyr::group_by(cluster) %>%
dplyr::summarise(
mean_systolic = mean(avg_systolic, na.rm = TRUE),
mean_choc_grams = mean(choc_grams, na.rm = TRUE),
mean_age = mean(age, na.rm = TRUE),
mean_calories = mean(DR1TKCAL, na.rm = TRUE),
mean_sugar = mean(DR1TSUGR, na.rm = TRUE),
mean_fat = mean(DR1TTFAT, na.rm = TRUE),
mean_protein = mean(DR1TPROT, na.rm = TRUE),
mean_carb = mean(DR1TCARB, na.rm = TRUE)
)
cluster_summary
## # A tibble: 3 × 9
## cluster mean_systolic mean_choc_grams mean_age mean_calories mean_sugar
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 108. 0.0667 23.2 1692. 87.7
## 2 2 137. 0.407 62.8 1649. 79.4
## 3 3 121. 0.691 39.8 3448. 187.
## # ℹ 3 more variables: mean_fat <dbl>, mean_protein <dbl>, mean_carb <dbl>
#Visualize clusters
Conclusion:- Chocolate/cocoa intake alone was not a strong predictor of
blood pressure. The simple model had a very low AUC of 0.505, almost
like random guessing.
The full Random Forest model performed better with an AUC of 0.798 and accuracy of 73.9%, showing that prediction improved when age, gender, calories, sugar, fat, protein, and carbohydrates were included.
Overall, blood pressure was influenced more by age and overall diet pattern than chocolate intake alone. Therefore, chocolate/cocoa intake may be only a small factor in cardiovascular risk prediction.