#STATISTICAL ANALYSIS SECTION #Purpose:To validate clinical trends before applying machine learning. #This includes: #1. Descriptive statistics #2. Group comparison using t-test / ANOVA #3. Correlation analysis #4. Regression analysis for dose-response relationship
library(httr)
library(haven)
## Warning: package 'haven' was built under R version 4.5.3
url <- "https://wwwn.cdc.gov/Nchs/Nhanes/2017-20121/DEMO_J.XPT"
dest <- file.path(tempdir(), "DEMO_J.XPT")
res <- GET(
url,
user_agent("R"),
write_disk(dest, overwrite = TRUE)
)
print(file.info(dest)$size)
## [1] 20905
#1. LOAD REQUIRED LIBRARIES
library(dplyr) # Used for data cleaning, grouping, and summarizing
## Warning: package 'dplyr' was built under R version 4.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2) # Used for creating visualizations
## Warning: package 'ggplot2' was built under R version 4.5.3
library(psych) # Used for descriptive statistics
## Warning: package 'psych' was built under R version 4.5.3
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(broom) # Used to convert model results into clean tables
## Warning: package 'broom' was built under R version 4.5.3
library(nhanesA) # Download NHANES datasets directly in R
## Warning: package 'nhanesA' was built under R version 4.5.3
library(dplyr) # Data manipulation
library(caret) # Train/test split + preprocessing + ML utilities
## Warning: package 'caret' was built under R version 4.5.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:httr':
##
## progress
library(randomForest) # Random Forest model
## Warning: package 'randomForest' was built under R version 4.5.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(pROC) # ROC and AUC evaluation
## Warning: package 'pROC' was built under R version 4.5.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
#2. DOWNLOAD NHANES DATA
DEMO_J <- nhanes("DEMO_J") # Demographic data
BPX_J <- nhanes("BPX_J") # Blood pressure examination data
DR1TOT_J <- nhanes("DR1TOT_J") # Day 1 total nutrient intake
DR1IFF_J <- nhanes("DR1IFF_J") # Day 1 individual foods file
dim(DR1IFF_J)
## [1] 112683 84
#4. CREATE CHOCOLATE INDICATOR
chocolate_codes <- c(
53111000, 53113000, 53114000, 53201000, 53202000,
53203000, 53233060, 54101000, 54102000, 92530010
)
# Filter food-level data to rows representing chocolate-related foods
chocolate_data <- DR1IFF_J %>%
filter(DR1IFDCD %in% chocolate_codes) %>% # Keep only chocolate-coded foods
group_by(SEQN) %>% # Group by participant ID
summarise(choc_grams = sum(DR1IGRMS, na.rm = TRUE), .groups = "drop") %>%
mutate(chocolate = 1) # Mark participant as chocolate consumer
#5. MERGE NHANES TABLES INTO ONE ANALYSIS DATASET
final_data <- DEMO_J %>%
inner_join(BPX_J, by = "SEQN") %>% # Merge demographics + blood pressure
inner_join(DR1TOT_J, by = "SEQN") %>% # Merge total nutrition intake
left_join(chocolate_data, by = "SEQN") # Add chocolate indicator and grams
#6. REPLACE MISSING CHOCOLATE VALUES
final_data <- final_data %>%
mutate(
chocolate = ifelse(is.na(chocolate), 0, chocolate), # 0 = no chocolate food found
choc_grams = ifelse(is.na(choc_grams), 0, choc_grams) # 0 grams if no chocolate intake
)
#7.Create Systolic and Diastolic BP Outcomes
final_data <- final_data %>%
mutate(
# Convert systolic BP readings to numeric
BPXSY1 = as.numeric(BPXSY1),
BPXSY2 = as.numeric(BPXSY2),
BPXSY3 = as.numeric(BPXSY3),
BPXSY4 = as.numeric(BPXSY4),
# Convert diastolic BP readings to numeric
BPXDI1 = as.numeric(BPXDI1),
BPXDI2 = as.numeric(BPXDI2),
BPXDI3 = as.numeric(BPXDI3),
BPXDI4 = as.numeric(BPXDI4),
# Average systolic BP using available readings
avg_systolic = rowMeans(
select(., BPXSY1, BPXSY2, BPXSY3, BPXSY4),
na.rm = TRUE
),
# Average diastolic BP using available readings
avg_diastolic = rowMeans(
select(., BPXDI1, BPXDI2, BPXDI3, BPXDI4),
na.rm = TRUE
),
# Replace NaN with NA if all readings were missing
avg_systolic = ifelse(is.nan(avg_systolic), NA, avg_systolic),
avg_diastolic = ifelse(is.nan(avg_diastolic), NA, avg_diastolic)
)
#8. Prepare Analysis Dataset
analysis_data <- final_data %>%
select(
SEQN, # Participant ID
avg_systolic, # Average systolic blood pressure
avg_diastolic, # Average diastolic blood pressure
chocolate, # Chocolate/cocoa consumer: 0 = No, 1 = Yes
choc_grams, # Chocolate/cocoa intake in grams
RIDAGEYR, # Age
RIAGENDR, # Gender
DR1TKCAL, # Total calories
DR1TSUGR, # Total sugar
DR1TTFAT, # Total fat
DR1TPROT, # Total protein
DR1TCARB # Total carbohydrates
) %>%
filter(
!is.na(avg_systolic),
!is.na(avg_diastolic)
) %>%
mutate(
chocolate = factor(chocolate, levels = c(0, 1), labels = c("No", "Yes")),
RIAGENDR = as.factor(RIAGENDR),
choc_grams = as.numeric(choc_grams),
RIDAGEYR = as.numeric(RIDAGEYR),
DR1TKCAL = as.numeric(DR1TKCAL),
DR1TSUGR = as.numeric(DR1TSUGR),
DR1TTFAT = as.numeric(DR1TTFAT),
DR1TPROT = as.numeric(DR1TPROT),
DR1TCARB = as.numeric(DR1TCARB)
) %>%
na.omit()
#9. Descriptive Statistics
descriptive_table <- analysis_data %>%
summarise(
N = n(),
mean_age = mean(RIDAGEYR),
sd_age = sd(RIDAGEYR),
mean_systolic = mean(avg_systolic),
sd_systolic = sd(avg_systolic),
mean_diastolic = mean(avg_diastolic),
sd_diastolic = sd(avg_diastolic),
mean_choc_grams = mean(choc_grams),
sd_choc_grams = sd(choc_grams),
mean_calories = mean(DR1TKCAL),
sd_calories = sd(DR1TKCAL),
mean_sugar = mean(DR1TSUGR),
sd_sugar = sd(DR1TSUGR),
mean_fat = mean(DR1TTFAT),
sd_fat = sd(DR1TTFAT)
)
print(descriptive_table)
## N mean_age sd_age mean_systolic sd_systolic mean_diastolic sd_diastolic
## 1 6125 41.7742 22.39373 121.772 19.99214 68.74623 15.31413
## mean_choc_grams sd_choc_grams mean_calories sd_calories mean_sugar sd_sugar
## 1 0.3401469 5.111966 2090.101 1006.304 108.046 76.29649
## mean_fat sd_fat
## 1 84.2676 48.80818
#10. Group Comparison: Chocolate vs No Chocolate This is your closest version of high-flavanol vs low-flavanol comparison.
# GROUP SUMMARY BY CHOCOLATE INTAKE
group_summary <- analysis_data %>%
group_by(chocolate) %>%
summarise(
n = n(),
mean_systolic = mean(avg_systolic),
sd_systolic = sd(avg_systolic),
mean_diastolic = mean(avg_diastolic),
sd_diastolic = sd(avg_diastolic),
mean_age = mean(RIDAGEYR),
sd_age = sd(RIDAGEYR),
mean_choc_grams = mean(choc_grams),
sd_choc_grams = sd(choc_grams),
.groups = "drop"
)
print(group_summary)
## # A tibble: 2 × 10
## chocolate n mean_systolic sd_systolic mean_diastolic sd_diastolic mean_age
## <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 No 6086 122. 19.9 68.8 15.3 41.7
## 2 Yes 39 126. 26.4 67.8 17.2 47.5
## # ℹ 3 more variables: sd_age <dbl>, mean_choc_grams <dbl>, sd_choc_grams <dbl>
#11. T-TEST: SYSTOLIC BP BY CHOCOLATE GROUP
t_sbp <- t.test(
avg_systolic ~ chocolate,
data = analysis_data
)
print(t_sbp)
##
## Welch Two Sample t-test
##
## data: avg_systolic by chocolate
## t = -1.0184, df = 38.279, p-value = 0.3149
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
## -12.865632 4.252169
## sample estimates:
## mean in group No mean in group Yes
## 121.7446 126.0513
#12. T-TEST: DIASTOLIC BP BY CHOCOLATE GROUP
t_dbp <- t.test(
avg_diastolic ~ chocolate,
data = analysis_data
)
print(t_dbp)
##
## Welch Two Sample t-test
##
## data: avg_diastolic by chocolate
## t = 0.32761, df = 38.385, p-value = 0.745
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
## -4.689784 6.501474
## sample estimates:
## mean in group No mean in group Yes
## 68.75200 67.84615
Interpretation: If p-value is less than 0.05, average BP is significantly different between chocolate consumers and non-consumers. If p-value is greater than or equal to 0.05, there is no significant difference.
#13. Create Chocolate Intake Levels #This is similar to comparing low vs high flavanol groups, but based on grams.
# CREATE CHOCOLATE INTAKE LEVELS
analysis_data <- analysis_data %>%
mutate(
chocolate_level = case_when(
choc_grams == 0 ~ "None",
choc_grams > 0 & choc_grams <= 25 ~ "Low",
choc_grams > 25 & choc_grams <= 50 ~ "Moderate",
choc_grams > 50 ~ "High"
),
chocolate_level = factor(
chocolate_level,
levels = c("None", "Low", "Moderate", "High")
)
)
table(analysis_data$chocolate_level)
##
## None Low Moderate High
## 6086 10 10 19
#14. ANOVA: Compare BP Across Chocolate Intake Levels
# ANOVA: SYSTOLIC BP ACROSS CHOCOLATE LEVELS
anova_sbp <- aov(
avg_systolic ~ chocolate_level,
data = analysis_data
)
summary(anova_sbp)
## Df Sum Sq Mean Sq F value Pr(>F)
## chocolate_level 3 2657 885.5 2.217 0.084 .
## Residuals 6121 2445019 399.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#15. POST-HOC TEST IF ANOVA IS SIGNIFICANT
TukeyHSD(anova_sbp)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = avg_systolic ~ chocolate_level, data = analysis_data)
##
## $chocolate_level
## diff lwr upr p adj
## Low-None -6.811217 -23.065777 9.443343 0.7037382
## Moderate-None 12.388783 -3.865777 28.643343 0.2039826
## High-None 5.904572 -5.896427 17.705571 0.5720521
## Moderate-Low 19.200000 -3.768557 42.168557 0.1381931
## High-Low 12.715789 -7.349291 32.780870 0.3625473
## High-Moderate -6.484211 -26.549291 13.580870 0.8400315
# ANOVA: DIASTOLIC BP ACROSS CHOCOLATE LEVELS
anova_dbp <- aov(
avg_diastolic ~ chocolate_level,
data = analysis_data
)
summary(anova_dbp)
## Df Sum Sq Mean Sq F value Pr(>F)
## chocolate_level 3 907 302.3 1.289 0.276
## Residuals 6121 1435308 234.5
#POST-HOC TEST FOR DIASTOLIC BP
TukeyHSD(anova_dbp)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = avg_diastolic ~ chocolate_level, data = analysis_data)
##
## $chocolate_level
## diff lwr upr p adj
## Low-None -4.485332 -16.939271 7.968606 0.7911925
## Moderate-None 7.114668 -5.339271 19.568606 0.4570110
## High-None -3.243227 -12.284931 5.798476 0.7932178
## Moderate-Low 11.600000 -5.998076 29.198076 0.3269420
## High-Low 1.242105 -14.131382 16.615593 0.9968365
## High-Moderate -10.357895 -25.731382 5.015593 0.3073977
#16. Correlation Analysis #This checks whether higher chocolate/cocoa grams are associated with BP.
# CORRELATION: CHOCOLATE GRAMS VS SYSTOLIC BP
cor_sbp <- cor.test(
analysis_data$choc_grams,
analysis_data$avg_systolic,
method = "pearson"
)
print(cor_sbp)
##
## Pearson's product-moment correlation
##
## data: analysis_data$choc_grams and analysis_data$avg_systolic
## t = 1.6395, df = 6123, p-value = 0.1012
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.004098458 0.045968372
## sample estimates:
## cor
## 0.02094809
# CORRELATION: CHOCOLATE GRAMS VS DIASTOLIC BP
cor_dbp <- cor.test(
analysis_data$choc_grams,
analysis_data$avg_diastolic,
method = "pearson"
)
print(cor_dbp)
##
## Pearson's product-moment correlation
##
## data: analysis_data$choc_grams and analysis_data$avg_diastolic
## t = -0.45725, df = 6123, p-value = 0.6475
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03088329 0.01920379
## sample estimates:
## cor
## -0.005843415
#17. Regression Analysis Like Adjusted Paper-Style Analysis
#The paper used repeated-measures ANOVA because it had intervention and time. our dataset does not have repeated time points, so We use adjusted linear regression.
# MULTIVARIABLE LINEAR REGRESSION: SYSTOLIC BP
lm_sbp <- lm(
avg_systolic ~ chocolate + choc_grams + RIDAGEYR + RIAGENDR +
DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB,
data = analysis_data
)
summary(lm_sbp)
##
## Call:
## lm(formula = avg_systolic ~ chocolate + choc_grams + RIDAGEYR +
## RIAGENDR + DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB,
## data = analysis_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.622 -9.922 -1.346 8.051 105.315
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 99.723105 0.728660 136.858 < 2e-16 ***
## chocolateYes 3.186095 4.592818 0.694 0.487889
## choc_grams -0.038625 0.071521 -0.540 0.589177
## RIDAGEYR 0.539226 0.009139 59.005 < 2e-16 ***
## RIAGENDRFemale -1.642675 0.426154 -3.855 0.000117 ***
## DR1TKCAL 0.004845 0.001235 3.923 8.84e-05 ***
## DR1TSUGR -0.005929 0.005250 -1.129 0.258830
## DR1TTFAT -0.038688 0.012812 -3.020 0.002540 **
## DR1TPROT -0.027935 0.009268 -3.014 0.002587 **
## DR1TCARB -0.014850 0.006174 -2.405 0.016190 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.89 on 6115 degrees of freedom
## Multiple R-squared: 0.3695, Adjusted R-squared: 0.3686
## F-statistic: 398.2 on 9 and 6115 DF, p-value: < 2.2e-16
# MULTIVARIABLE LINEAR REGRESSION: DIASTOLIC BP
lm_dbp <- lm(
avg_diastolic ~ chocolate + choc_grams + RIDAGEYR + RIAGENDR +
DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB,
data = analysis_data
)
summary(lm_dbp)
##
## Call:
## lm(formula = avg_diastolic ~ chocolate + choc_grams + RIDAGEYR +
## RIAGENDR + DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB,
## data = analysis_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -77.570 -6.920 1.156 8.531 53.412
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58.304412 0.660431 88.282 < 2e-16 ***
## chocolateYes 2.058550 4.162767 0.495 0.62096
## choc_grams -0.079371 0.064824 -1.224 0.22084
## RIDAGEYR 0.217321 0.008283 26.237 < 2e-16 ***
## RIAGENDRFemale -1.245217 0.386251 -3.224 0.00127 **
## DR1TKCAL 0.007182 0.001119 6.416 1.51e-10 ***
## DR1TSUGR 0.002613 0.004759 0.549 0.58296
## DR1TTFAT -0.064129 0.011612 -5.523 3.48e-08 ***
## DR1TPROT -0.007184 0.008400 -0.855 0.39245
## DR1TCARB -0.029302 0.005596 -5.236 1.69e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.4 on 6115 degrees of freedom
## Multiple R-squared: 0.1173, Adjusted R-squared: 0.116
## F-statistic: 90.26 on 9 and 6115 DF, p-value: < 2.2e-16
#18.Boxplot: Systolic BP by Chocolate Level
#This graph helps compare whether people with different chocolate/cocoa intake levels have different average systolic blood pressure.
ggplot(analysis_data, aes(x = chocolate_level, y = avg_systolic, fill = chocolate_level)) +
geom_boxplot(alpha = 0.7) +
geom_jitter(width = 0.15, alpha = 0.3, size = 1) +
labs(
title = "Average Systolic Blood Pressure by Chocolate/Cocoa Intake Level",
x = "Chocolate/Cocoa Intake Level",
y = "Average Systolic Blood Pressure"
) +
scale_fill_brewer(palette = "Set2") +
theme_minimal() +
theme(
legend.position = "none"
)
Interpretation :- The boxplot compares average systolic blood pressure
across chocolate/cocoa intake groups. If the median line is lower in
higher chocolate intake groups, it may suggest that higher
chocolate/cocoa intake is associated with lower systolic blood pressure.
However, if the boxes overlap strongly, the difference between groups
may be small and should be confirmed using statistical tests such as
ANOVA or regression.
#19. Boxplot: Diastolic BP by Chocolate Level
ggplot(analysis_data, aes(x = chocolate_level, y = avg_diastolic, fill = chocolate_level)) +
geom_boxplot(alpha = 0.7) +
geom_jitter(width = 0.15, alpha = 0.3, size = 1) +
labs(
title = "Average Diastolic Blood Pressure by Chocolate/Cocoa Intake Level",
x = "Chocolate/Cocoa Intake Level",
y = "Average Diastolic Blood Pressure"
) +
scale_fill_brewer(palette = "Set2") +
theme_minimal() +
theme(
legend.position = "none"
)
Interpretation : - This boxplot compares average diastolic blood
pressure across chocolate/cocoa intake levels. The dots represent
individual participants. The median diastolic BP appears fairly similar
across groups, with the moderate intake group slightly higher, but
statistical testing such as ANOVA or regression is needed to confirm
whether the differences are meaningful.
#20. Scatterplot: Chocolate Grams vs Systolic BP
ggplot(analysis_data, aes(x = choc_grams, y = avg_systolic)) +
geom_point(color = "steelblue", alpha = 0.4, size = 2) +
geom_smooth(method = "lm", se = TRUE, color = "red", fill = "pink") +
labs(
title = "Chocolate/Cocoa Grams and Average Systolic Blood Pressure",
x = "Chocolate/Cocoa Intake in Grams",
y = "Average Systolic Blood Pressure"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
Interpretation :- This scatter plot shows a weak positive relationship
between chocolate/cocoa intake in grams and average systolic blood
pressure. Most participants reported no chocolate/cocoa intake, and only
a small number had higher intake levels. Therefore, the relationship is
weak and should be confirmed using regression analysis and statistical
significance testing.
#Final Interpretation:- The original cosmos study used a randomized, double-blind, placebo-controlled cross-over design to compare high-flavanol and low-flavanol cocoa interventions during and after an acute mental stress task. Outcomes included flow-mediated dilation, forearm blood flow, systolic blood pressure, and diastolic blood pressure, analyzed using two-way repeated measures ANOVA with intervention and time as within-subject factors.
In contrast, the NHANES dataset used in the present study is observational and cross-sectional. Therefore, repeated-measures intervention analysis could not be directly replicated. Instead, chocolate/cocoa intake was used as the exposure variable, and systolic and diastolic blood pressure were used as cardiovascular outcomes. Descriptive statistics, t-tests, ANOVA, correlation analysis, and multivariable linear regression were used to evaluate associations between chocolate/cocoa intake and blood pressure while adjusting for age, gender, and dietary intake.