#STATISTICAL ANALYSIS SECTION #Purpose:To validate clinical trends before applying machine learning. #This includes: #1. Descriptive statistics #2. Group comparison using t-test / ANOVA #3. Correlation analysis #4. Regression analysis for dose-response relationship

library(httr)
library(haven)
## Warning: package 'haven' was built under R version 4.5.3
url <- "https://wwwn.cdc.gov/Nchs/Nhanes/2017-20121/DEMO_J.XPT"
dest <- file.path(tempdir(), "DEMO_J.XPT")

res <- GET(
  url,
  user_agent("R"),
  write_disk(dest, overwrite = TRUE)
)

print(file.info(dest)$size)
## [1] 20905

#1. LOAD REQUIRED LIBRARIES

library(dplyr)      # Used for data cleaning, grouping, and summarizing
## Warning: package 'dplyr' was built under R version 4.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)    # Used for creating visualizations
## Warning: package 'ggplot2' was built under R version 4.5.3
library(psych)      # Used for descriptive statistics
## Warning: package 'psych' was built under R version 4.5.3
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(broom)      # Used to convert model results into clean tables
## Warning: package 'broom' was built under R version 4.5.3
library(nhanesA)      # Download NHANES datasets directly in R
## Warning: package 'nhanesA' was built under R version 4.5.3
library(dplyr)        # Data manipulation
library(caret)        # Train/test split + preprocessing + ML utilities
## Warning: package 'caret' was built under R version 4.5.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:httr':
## 
##     progress
library(randomForest) # Random Forest model
## Warning: package 'randomForest' was built under R version 4.5.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
## 
##     outlier
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(pROC)         # ROC and AUC evaluation
## Warning: package 'pROC' was built under R version 4.5.3
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

#2. DOWNLOAD NHANES DATA

DEMO_J   <- nhanes("DEMO_J")    # Demographic data
BPX_J    <- nhanes("BPX_J")     # Blood pressure examination data
DR1TOT_J <- nhanes("DR1TOT_J")  # Day 1 total nutrient intake
DR1IFF_J <- nhanes("DR1IFF_J")  # Day 1 individual foods file

dim(DR1IFF_J)
## [1] 112683     84

#4. CREATE CHOCOLATE INDICATOR

chocolate_codes <- c(
  53111000, 53113000, 53114000, 53201000, 53202000,
  53203000, 53233060, 54101000, 54102000, 92530010
)

# Filter food-level data to rows representing chocolate-related foods
chocolate_data <- DR1IFF_J %>%
  filter(DR1IFDCD %in% chocolate_codes) %>%   # Keep only chocolate-coded foods
  group_by(SEQN) %>%                          # Group by participant ID
  summarise(choc_grams = sum(DR1IGRMS, na.rm = TRUE), .groups = "drop") %>% 
  mutate(chocolate = 1)                       # Mark participant as chocolate consumer

#5. MERGE NHANES TABLES INTO ONE ANALYSIS DATASET

final_data <- DEMO_J %>%
  inner_join(BPX_J, by = "SEQN") %>%          # Merge demographics + blood pressure
  inner_join(DR1TOT_J, by = "SEQN") %>%       # Merge total nutrition intake
  left_join(chocolate_data, by = "SEQN")      # Add chocolate indicator and grams

#6. REPLACE MISSING CHOCOLATE VALUES

final_data <- final_data %>%
  mutate(
    chocolate  = ifelse(is.na(chocolate), 0, chocolate),    # 0 = no chocolate food found
    choc_grams = ifelse(is.na(choc_grams), 0, choc_grams)   # 0 grams if no chocolate intake
  )

#7.Create Systolic and Diastolic BP Outcomes

final_data <- final_data %>%
  mutate(
    # Convert systolic BP readings to numeric
    BPXSY1 = as.numeric(BPXSY1),
    BPXSY2 = as.numeric(BPXSY2),
    BPXSY3 = as.numeric(BPXSY3),
    BPXSY4 = as.numeric(BPXSY4),
    
    # Convert diastolic BP readings to numeric
    BPXDI1 = as.numeric(BPXDI1),
    BPXDI2 = as.numeric(BPXDI2),
    BPXDI3 = as.numeric(BPXDI3),
    BPXDI4 = as.numeric(BPXDI4),
    
    # Average systolic BP using available readings
    avg_systolic = rowMeans(
      select(., BPXSY1, BPXSY2, BPXSY3, BPXSY4),
      na.rm = TRUE
    ),
    
    # Average diastolic BP using available readings
    avg_diastolic = rowMeans(
      select(., BPXDI1, BPXDI2, BPXDI3, BPXDI4),
      na.rm = TRUE
    ),
    
    # Replace NaN with NA if all readings were missing
    avg_systolic = ifelse(is.nan(avg_systolic), NA, avg_systolic),
    avg_diastolic = ifelse(is.nan(avg_diastolic), NA, avg_diastolic)
  )

#8. Prepare Analysis Dataset

analysis_data <- final_data %>%
  select(
    SEQN,            # Participant ID
    avg_systolic,    # Average systolic blood pressure
    avg_diastolic,   # Average diastolic blood pressure
    chocolate,       # Chocolate/cocoa consumer: 0 = No, 1 = Yes
    choc_grams,      # Chocolate/cocoa intake in grams
    RIDAGEYR,        # Age
    RIAGENDR,        # Gender
    DR1TKCAL,        # Total calories
    DR1TSUGR,        # Total sugar
    DR1TTFAT,        # Total fat
    DR1TPROT,        # Total protein
    DR1TCARB         # Total carbohydrates
  ) %>%
  filter(
    !is.na(avg_systolic),
    !is.na(avg_diastolic)
  ) %>%
  mutate(
    chocolate = factor(chocolate, levels = c(0, 1), labels = c("No", "Yes")),
    RIAGENDR = as.factor(RIAGENDR),
    choc_grams = as.numeric(choc_grams),
    RIDAGEYR = as.numeric(RIDAGEYR),
    DR1TKCAL = as.numeric(DR1TKCAL),
    DR1TSUGR = as.numeric(DR1TSUGR),
    DR1TTFAT = as.numeric(DR1TTFAT),
    DR1TPROT = as.numeric(DR1TPROT),
    DR1TCARB = as.numeric(DR1TCARB)
  ) %>%
  na.omit()

#9. Descriptive Statistics

descriptive_table <- analysis_data %>%
  summarise(
    N = n(),
    mean_age = mean(RIDAGEYR),
    sd_age = sd(RIDAGEYR),
    
    mean_systolic = mean(avg_systolic),
    sd_systolic = sd(avg_systolic),
    
    mean_diastolic = mean(avg_diastolic),
    sd_diastolic = sd(avg_diastolic),
    
    mean_choc_grams = mean(choc_grams),
    sd_choc_grams = sd(choc_grams),
    
    mean_calories = mean(DR1TKCAL),
    sd_calories = sd(DR1TKCAL),
    
    mean_sugar = mean(DR1TSUGR),
    sd_sugar = sd(DR1TSUGR),
    
    mean_fat = mean(DR1TTFAT),
    sd_fat = sd(DR1TTFAT)
  )

print(descriptive_table)
##      N mean_age   sd_age mean_systolic sd_systolic mean_diastolic sd_diastolic
## 1 6125  41.7742 22.39373       121.772    19.99214       68.74623     15.31413
##   mean_choc_grams sd_choc_grams mean_calories sd_calories mean_sugar sd_sugar
## 1       0.3401469      5.111966      2090.101    1006.304    108.046 76.29649
##   mean_fat   sd_fat
## 1  84.2676 48.80818

#10. Group Comparison: Chocolate vs No Chocolate This is your closest version of high-flavanol vs low-flavanol comparison.

# GROUP SUMMARY BY CHOCOLATE INTAKE

group_summary <- analysis_data %>%
  group_by(chocolate) %>%
  summarise(
    n = n(),
    
    mean_systolic = mean(avg_systolic),
    sd_systolic = sd(avg_systolic),
    
    mean_diastolic = mean(avg_diastolic),
    sd_diastolic = sd(avg_diastolic),
    
    mean_age = mean(RIDAGEYR),
    sd_age = sd(RIDAGEYR),
    
    mean_choc_grams = mean(choc_grams),
    sd_choc_grams = sd(choc_grams),
    
    .groups = "drop"
  )

print(group_summary)
## # A tibble: 2 × 10
##   chocolate     n mean_systolic sd_systolic mean_diastolic sd_diastolic mean_age
##   <fct>     <int>         <dbl>       <dbl>          <dbl>        <dbl>    <dbl>
## 1 No         6086          122.        19.9           68.8         15.3     41.7
## 2 Yes          39          126.        26.4           67.8         17.2     47.5
## # ℹ 3 more variables: sd_age <dbl>, mean_choc_grams <dbl>, sd_choc_grams <dbl>

#11. T-TEST: SYSTOLIC BP BY CHOCOLATE GROUP

t_sbp <- t.test(
  avg_systolic ~ chocolate,
  data = analysis_data
)

print(t_sbp)
## 
##  Welch Two Sample t-test
## 
## data:  avg_systolic by chocolate
## t = -1.0184, df = 38.279, p-value = 0.3149
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##  -12.865632   4.252169
## sample estimates:
##  mean in group No mean in group Yes 
##          121.7446          126.0513

#12. T-TEST: DIASTOLIC BP BY CHOCOLATE GROUP

t_dbp <- t.test(
  avg_diastolic ~ chocolate,
  data = analysis_data
)

print(t_dbp)
## 
##  Welch Two Sample t-test
## 
## data:  avg_diastolic by chocolate
## t = 0.32761, df = 38.385, p-value = 0.745
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##  -4.689784  6.501474
## sample estimates:
##  mean in group No mean in group Yes 
##          68.75200          67.84615

Interpretation: If p-value is less than 0.05, average BP is significantly different between chocolate consumers and non-consumers. If p-value is greater than or equal to 0.05, there is no significant difference.

#13. Create Chocolate Intake Levels #This is similar to comparing low vs high flavanol groups, but based on grams.

# CREATE CHOCOLATE INTAKE LEVELS

analysis_data <- analysis_data %>%
  mutate(
    chocolate_level = case_when(
      choc_grams == 0 ~ "None",
      choc_grams > 0 & choc_grams <= 25 ~ "Low",
      choc_grams > 25 & choc_grams <= 50 ~ "Moderate",
      choc_grams > 50 ~ "High"
    ),
    
    chocolate_level = factor(
      chocolate_level,
      levels = c("None", "Low", "Moderate", "High")
    )
  )

table(analysis_data$chocolate_level)
## 
##     None      Low Moderate     High 
##     6086       10       10       19

#14. ANOVA: Compare BP Across Chocolate Intake Levels

# ANOVA: SYSTOLIC BP ACROSS CHOCOLATE LEVELS

anova_sbp <- aov(
  avg_systolic ~ chocolate_level,
  data = analysis_data
)

summary(anova_sbp)
##                   Df  Sum Sq Mean Sq F value Pr(>F)  
## chocolate_level    3    2657   885.5   2.217  0.084 .
## Residuals       6121 2445019   399.4                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#15. POST-HOC TEST IF ANOVA IS SIGNIFICANT

TukeyHSD(anova_sbp)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = avg_systolic ~ chocolate_level, data = analysis_data)
## 
## $chocolate_level
##                    diff        lwr       upr     p adj
## Low-None      -6.811217 -23.065777  9.443343 0.7037382
## Moderate-None 12.388783  -3.865777 28.643343 0.2039826
## High-None      5.904572  -5.896427 17.705571 0.5720521
## Moderate-Low  19.200000  -3.768557 42.168557 0.1381931
## High-Low      12.715789  -7.349291 32.780870 0.3625473
## High-Moderate -6.484211 -26.549291 13.580870 0.8400315
# ANOVA: DIASTOLIC BP ACROSS CHOCOLATE LEVELS

anova_dbp <- aov(
  avg_diastolic ~ chocolate_level,
  data = analysis_data
)

summary(anova_dbp)
##                   Df  Sum Sq Mean Sq F value Pr(>F)
## chocolate_level    3     907   302.3   1.289  0.276
## Residuals       6121 1435308   234.5

#POST-HOC TEST FOR DIASTOLIC BP

TukeyHSD(anova_dbp)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = avg_diastolic ~ chocolate_level, data = analysis_data)
## 
## $chocolate_level
##                     diff        lwr       upr     p adj
## Low-None       -4.485332 -16.939271  7.968606 0.7911925
## Moderate-None   7.114668  -5.339271 19.568606 0.4570110
## High-None      -3.243227 -12.284931  5.798476 0.7932178
## Moderate-Low   11.600000  -5.998076 29.198076 0.3269420
## High-Low        1.242105 -14.131382 16.615593 0.9968365
## High-Moderate -10.357895 -25.731382  5.015593 0.3073977

#16. Correlation Analysis #This checks whether higher chocolate/cocoa grams are associated with BP.

# CORRELATION: CHOCOLATE GRAMS VS SYSTOLIC BP

cor_sbp <- cor.test(
  analysis_data$choc_grams,
  analysis_data$avg_systolic,
  method = "pearson"
)

print(cor_sbp)
## 
##  Pearson's product-moment correlation
## 
## data:  analysis_data$choc_grams and analysis_data$avg_systolic
## t = 1.6395, df = 6123, p-value = 0.1012
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.004098458  0.045968372
## sample estimates:
##        cor 
## 0.02094809
# CORRELATION: CHOCOLATE GRAMS VS DIASTOLIC BP

cor_dbp <- cor.test(
  analysis_data$choc_grams,
  analysis_data$avg_diastolic,
  method = "pearson"
)

print(cor_dbp)
## 
##  Pearson's product-moment correlation
## 
## data:  analysis_data$choc_grams and analysis_data$avg_diastolic
## t = -0.45725, df = 6123, p-value = 0.6475
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03088329  0.01920379
## sample estimates:
##          cor 
## -0.005843415

#17. Regression Analysis Like Adjusted Paper-Style Analysis

#The paper used repeated-measures ANOVA because it had intervention and time. our dataset does not have repeated time points, so We use adjusted linear regression.

# MULTIVARIABLE LINEAR REGRESSION: SYSTOLIC BP

lm_sbp <- lm(
  avg_systolic ~ chocolate + choc_grams + RIDAGEYR + RIAGENDR +
    DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB,
  data = analysis_data
)

summary(lm_sbp)
## 
## Call:
## lm(formula = avg_systolic ~ chocolate + choc_grams + RIDAGEYR + 
##     RIAGENDR + DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB, 
##     data = analysis_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.622  -9.922  -1.346   8.051 105.315 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    99.723105   0.728660 136.858  < 2e-16 ***
## chocolateYes    3.186095   4.592818   0.694 0.487889    
## choc_grams     -0.038625   0.071521  -0.540 0.589177    
## RIDAGEYR        0.539226   0.009139  59.005  < 2e-16 ***
## RIAGENDRFemale -1.642675   0.426154  -3.855 0.000117 ***
## DR1TKCAL        0.004845   0.001235   3.923 8.84e-05 ***
## DR1TSUGR       -0.005929   0.005250  -1.129 0.258830    
## DR1TTFAT       -0.038688   0.012812  -3.020 0.002540 ** 
## DR1TPROT       -0.027935   0.009268  -3.014 0.002587 ** 
## DR1TCARB       -0.014850   0.006174  -2.405 0.016190 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.89 on 6115 degrees of freedom
## Multiple R-squared:  0.3695, Adjusted R-squared:  0.3686 
## F-statistic: 398.2 on 9 and 6115 DF,  p-value: < 2.2e-16
# MULTIVARIABLE LINEAR REGRESSION: DIASTOLIC BP

lm_dbp <- lm(
  avg_diastolic ~ chocolate + choc_grams + RIDAGEYR + RIAGENDR +
    DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB,
  data = analysis_data
)

summary(lm_dbp)
## 
## Call:
## lm(formula = avg_diastolic ~ chocolate + choc_grams + RIDAGEYR + 
##     RIAGENDR + DR1TKCAL + DR1TSUGR + DR1TTFAT + DR1TPROT + DR1TCARB, 
##     data = analysis_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -77.570  -6.920   1.156   8.531  53.412 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    58.304412   0.660431  88.282  < 2e-16 ***
## chocolateYes    2.058550   4.162767   0.495  0.62096    
## choc_grams     -0.079371   0.064824  -1.224  0.22084    
## RIDAGEYR        0.217321   0.008283  26.237  < 2e-16 ***
## RIAGENDRFemale -1.245217   0.386251  -3.224  0.00127 ** 
## DR1TKCAL        0.007182   0.001119   6.416 1.51e-10 ***
## DR1TSUGR        0.002613   0.004759   0.549  0.58296    
## DR1TTFAT       -0.064129   0.011612  -5.523 3.48e-08 ***
## DR1TPROT       -0.007184   0.008400  -0.855  0.39245    
## DR1TCARB       -0.029302   0.005596  -5.236 1.69e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.4 on 6115 degrees of freedom
## Multiple R-squared:  0.1173, Adjusted R-squared:  0.116 
## F-statistic: 90.26 on 9 and 6115 DF,  p-value: < 2.2e-16

#18.Boxplot: Systolic BP by Chocolate Level

#This graph helps compare whether people with different chocolate/cocoa intake levels have different average systolic blood pressure.

ggplot(analysis_data, aes(x = chocolate_level, y = avg_systolic, fill = chocolate_level)) +
  geom_boxplot(alpha = 0.7) +
  geom_jitter(width = 0.15, alpha = 0.3, size = 1) +
  labs(
    title = "Average Systolic Blood Pressure by Chocolate/Cocoa Intake Level",
    x = "Chocolate/Cocoa Intake Level",
    y = "Average Systolic Blood Pressure"
  ) +
  scale_fill_brewer(palette = "Set2") +
  theme_minimal() +
  theme(
    legend.position = "none"
  )

Interpretation :- The boxplot compares average systolic blood pressure across chocolate/cocoa intake groups. If the median line is lower in higher chocolate intake groups, it may suggest that higher chocolate/cocoa intake is associated with lower systolic blood pressure. However, if the boxes overlap strongly, the difference between groups may be small and should be confirmed using statistical tests such as ANOVA or regression.

#19. Boxplot: Diastolic BP by Chocolate Level

ggplot(analysis_data, aes(x = chocolate_level, y = avg_diastolic, fill = chocolate_level)) +
  geom_boxplot(alpha = 0.7) +
  geom_jitter(width = 0.15, alpha = 0.3, size = 1) +
  labs(
    title = "Average Diastolic Blood Pressure by Chocolate/Cocoa Intake Level",
    x = "Chocolate/Cocoa Intake Level",
    y = "Average Diastolic Blood Pressure"
  ) +
  scale_fill_brewer(palette = "Set2") +
  theme_minimal() +
  theme(
    legend.position = "none"
  )

Interpretation : - This boxplot compares average diastolic blood pressure across chocolate/cocoa intake levels. The dots represent individual participants. The median diastolic BP appears fairly similar across groups, with the moderate intake group slightly higher, but statistical testing such as ANOVA or regression is needed to confirm whether the differences are meaningful.

#20. Scatterplot: Chocolate Grams vs Systolic BP

ggplot(analysis_data, aes(x = choc_grams, y = avg_systolic)) +
  geom_point(color = "steelblue", alpha = 0.4, size = 2) +
  geom_smooth(method = "lm", se = TRUE, color = "red", fill = "pink") +
  labs(
    title = "Chocolate/Cocoa Grams and Average Systolic Blood Pressure",
    x = "Chocolate/Cocoa Intake in Grams",
    y = "Average Systolic Blood Pressure"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Interpretation :- This scatter plot shows a weak positive relationship between chocolate/cocoa intake in grams and average systolic blood pressure. Most participants reported no chocolate/cocoa intake, and only a small number had higher intake levels. Therefore, the relationship is weak and should be confirmed using regression analysis and statistical significance testing.

#Final Interpretation:- The original cosmos study used a randomized, double-blind, placebo-controlled cross-over design to compare high-flavanol and low-flavanol cocoa interventions during and after an acute mental stress task. Outcomes included flow-mediated dilation, forearm blood flow, systolic blood pressure, and diastolic blood pressure, analyzed using two-way repeated measures ANOVA with intervention and time as within-subject factors.

In contrast, the NHANES dataset used in the present study is observational and cross-sectional. Therefore, repeated-measures intervention analysis could not be directly replicated. Instead, chocolate/cocoa intake was used as the exposure variable, and systolic and diastolic blood pressure were used as cardiovascular outcomes. Descriptive statistics, t-tests, ANOVA, correlation analysis, and multivariable linear regression were used to evaluate associations between chocolate/cocoa intake and blood pressure while adjusting for age, gender, and dietary intake.