#Part 0: Data Preparation (10 points)

# Import bmd.csv
data <- read.csv("C:\\Users\\safwa\\OneDrive - University at Albany - SUNY\\EPI 553\\Assignment\\bmd.csv")

# Load required packages
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'tibble' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'dplyr' was built under R version 4.5.2
## Warning: package 'stringr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(knitr)
## Warning: package 'knitr' was built under R version 4.5.2
library(broom)      # For clean statistical output
library(car)  
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.5.2
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.5.2
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

#Check Column names

names(data)
##  [1] "SEQN"     "RIAGENDR" "RIDAGEYR" "RIDRETH1" "BMXBMI"   "smoker"  
##  [7] "totmet"   "metcat"   "DXXOFBMD" "tbmdcat"  "calcium"  "vitd"    
## [13] "DSQTVD"   "DSQTCALC"

#Convert RIDRETH1 to factor with meaningful ethnicity labels

data <- data %>%
  mutate(
    RIDRETH1 = factor(
      RIDRETH1,
      levels = c(1, 2, 3, 4, 5),
      labels = c("Mexican American",
                 "Other Hispanic",
                 "Non-Hispanic White",
                 "Non-Hispanic Black",
                 "Other/Multi-Racial")
    )
  )

#Convert RIAGENDR to factor (Male/Female)

data <- data %>%
  mutate(
    RIAGENDR = factor(
      RIAGENDR,
      levels = c(1, 2),
      labels = c("Male", "Female")
    )
  )

Convert smoker to factor (Current/Past/Never)

data <- data %>%
  mutate(
    smoker = factor(
      smoker,
      levels = c(1, 2, 3),
      labels = c("Current", "Past", "Never")
    )
  )

Report total sample size

# Total sample size
n_total <- nrow(data)
n_total
## [1] 2898

Create analytic dataset removing missing BMD and ethnicity

analytic_data <- data %>%
  drop_na(DXXOFBMD, RIDRETH1, RIAGENDR, smoker)

#Report final analytic sample size

n_analytic <- nrow(analytic_data)

cat("Final analytic sample size:", n_analytic)
## Final analytic sample size: 2286

#Part 1: One-Way ANOVA (50 points) Research Question: Do mean BMD values differ across ethnicity groups? Ans : H0:μ1=μ2=μ3=μ4=μ5 𝐻A:At least one group mean differs Where:

μ₁ = Mean BMD (Mexican American)

μ₂ = Mean BMD (Other Hispanic)

μ₃ = Mean BMD (Non-Hispanic White)

μ₄ = Mean BMD (Non-Hispanic Black)

μ₅ = Mean BMD (Other)

Null hypothesis (H₀): Average bone mineral density is the same across all ethnicity groups. Alternative hypothesis (Hₐ): At least one ethnicity group has a different average bone mineral density.

#Step 2: Exploratory Analysis (10 points)

library(dplyr)
library(kableExtra)

summary_table <- analytic_data %>%
  group_by(RIDRETH1) %>%
  summarize(
    n = n(),
    mean_BMD = mean(DXXOFBMD, na.rm = TRUE),
    sd_BMD = sd(DXXOFBMD, na.rm = TRUE)
  )

summary_table %>%
  kable(digits = 3, caption = "BMD by Ethnicity") %>%
  kable_styling(full_width = FALSE)
BMD by Ethnicity
RIDRETH1 n mean_BMD sd_BMD
Mexican American 255 0.950 0.149
Other Hispanic 244 0.946 0.156
Non-Hispanic White 846 0.888 0.160
Non-Hispanic Black 532 0.973 0.161
Other/Multi-Racial 409 0.897 0.148
library(ggplot2)

ggplot(analytic_data, aes(x = RIDRETH1, y = DXXOFBMD)) +
  geom_boxplot() +
  geom_jitter(width = 0.2, alpha = 0.4) +
  labs(
    title = "Distribution of BMD by Ethnicity",
    x = "Ethnicity",
    y = "Bone Mineral Density (BMD)"
  ) +
  theme_minimal()

#Step 3: ANOVA F-test (10 points)

anova_model <- aov(DXXOFBMD ~ RIDRETH1, data = analytic_data)
summary(anova_model)
##               Df Sum Sq Mean Sq F value Pr(>F)    
## RIDRETH1       4   2.95  0.7371   30.18 <2e-16 ***
## Residuals   2281  55.70  0.0244                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(broom)

anova_results <- tidy(anova_model)

anova_results %>%
  kable(digits = 4, caption = "ANOVA Results") %>%
  kable_styling(full_width = FALSE)
ANOVA Results
term df sumsq meansq statistic p.value
RIDRETH1 4 2.9482 0.7371 30.185 0
Residuals 2281 55.6973 0.0244 NA NA

#Report F-statistic, df, p-value in formatted table #State decision (reject/fail to reject H_0) and interpret The ANOVA test showed F(4, 2281) = 30.19, p < 0.001. Since p < 0.05, we reject the null hypothesis. This suggests that mean BMD does differ significantly across ethnicity groups.

#Step 4: Assumption Checks (10 points)

#A. Normality (Q-Q Plot)
qqnorm(residuals(anova_model))
qqline(residuals(anova_model))

#B. Equal Variances (Levene’s Test)
library(car)

leveneTest(DXXOFBMD ~ RIDRETH1, data = analytic_data)
## Levene's Test for Homogeneity of Variance (center = median)
##         Df F value Pr(>F)
## group    4  1.5728 0.1788
##       2281

#Step 5: Post-hoc Comparisons (10 points)

#Conduct Tukey HSD test
tukey_results <- TukeyHSD(anova_model)

tukey_results
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = DXXOFBMD ~ RIDRETH1, data = analytic_data)
## 
## $RIDRETH1
##                                               diff          lwr         upr
## Other Hispanic-Mexican American       -0.004441273 -0.042644130  0.03376158
## Non-Hispanic White-Mexican American   -0.062377249 -0.092852618 -0.03190188
## Non-Hispanic Black-Mexican American    0.022391619 -0.010100052  0.05488329
## Other/Multi-Racial-Mexican American   -0.053319344 -0.087357253 -0.01928144
## Non-Hispanic White-Other Hispanic     -0.057935976 -0.088934694 -0.02693726
## Non-Hispanic Black-Other Hispanic      0.026832892 -0.006150151  0.05981593
## Other/Multi-Racial-Other Hispanic     -0.048878071 -0.083385341 -0.01437080
## Non-Hispanic Black-Non-Hispanic White  0.084768868  0.061164402  0.10837333
## Other/Multi-Racial-Non-Hispanic White  0.009057905 -0.016633367  0.03474918
## Other/Multi-Racial-Non-Hispanic Black -0.075710963 -0.103764519 -0.04765741
##                                           p adj
## Other Hispanic-Mexican American       0.9978049
## Non-Hispanic White-Mexican American   0.0000003
## Non-Hispanic Black-Mexican American   0.3276613
## Other/Multi-Racial-Mexican American   0.0001919
## Non-Hispanic White-Other Hispanic     0.0000036
## Non-Hispanic Black-Other Hispanic     0.1722466
## Other/Multi-Racial-Other Hispanic     0.0010720
## Non-Hispanic Black-Non-Hispanic White 0.0000000
## Other/Multi-Racial-Non-Hispanic White 0.8719109
## Other/Multi-Racial-Non-Hispanic Black 0.0000000
#Create formatted table showing:
tukey_df <- as.data.frame(tukey_results$RIDRETH1)

tukey_df$Comparison <- rownames(tukey_df)

tukey_df %>%
  select(Comparison, diff, lwr, upr, `p adj`) %>%
  kable(digits = 4, caption = "Tukey Post-Hoc Comparisons") %>%
  kable_styling(full_width = FALSE)
Tukey Post-Hoc Comparisons
Comparison diff lwr upr p adj
Other Hispanic-Mexican American Other Hispanic-Mexican American -0.0044 -0.0426 0.0338 0.9978
Non-Hispanic White-Mexican American Non-Hispanic White-Mexican American -0.0624 -0.0929 -0.0319 0.0000
Non-Hispanic Black-Mexican American Non-Hispanic Black-Mexican American 0.0224 -0.0101 0.0549 0.3277
Other/Multi-Racial-Mexican American Other/Multi-Racial-Mexican American -0.0533 -0.0874 -0.0193 0.0002
Non-Hispanic White-Other Hispanic Non-Hispanic White-Other Hispanic -0.0579 -0.0889 -0.0269 0.0000
Non-Hispanic Black-Other Hispanic Non-Hispanic Black-Other Hispanic 0.0268 -0.0062 0.0598 0.1722
Other/Multi-Racial-Other Hispanic Other/Multi-Racial-Other Hispanic -0.0489 -0.0834 -0.0144 0.0011
Non-Hispanic Black-Non-Hispanic White Non-Hispanic Black-Non-Hispanic White 0.0848 0.0612 0.1084 0.0000
Other/Multi-Racial-Non-Hispanic White Other/Multi-Racial-Non-Hispanic White 0.0091 -0.0166 0.0347 0.8719
Other/Multi-Racial-Non-Hispanic Black Other/Multi-Racial-Non-Hispanic Black -0.0757 -0.1038 -0.0477 0.0000
#Interpret which groups significantly differ

Step 6: Effect Size (5 points)

#Calculate eta-squared: eta-squared = SS_between / SS_total
anova_summary <- summary(anova_model)[[1]]

SS_between <- anova_summary["RIDRETH1", "Sum Sq"]
SS_total <- sum(anova_summary[, "Sum Sq"])

eta_sq <- SS_between / SS_total

eta_sq
## [1] 0.05027196

#Interpret using Cohen’s guidelines (small approximately 0.01, medium approximately 0.06, large approximately 0.14) Since 0.0503 is between 0.01 and 0.06 and slightly below 0.06, this represents a small-to-moderate effect size, closer to a medium effect.

#Part 2: Correlation Analysis (40 points)

library(dplyr)

analytic_data <- analytic_data %>%
  mutate(
    DSQTCALC = ifelse(is.na(DSQTCALC), 0, DSQTCALC),
    DSQTVD   = ifelse(is.na(DSQTVD), 0, DSQTVD),
    
    calcium_total = calcium + DSQTCALC,
    vitd_total    = vitd + DSQTVD
  )

#Step 2: Assess and Apply Transformations (5 points)

library(ggplot2)

# BMI
ggplot(analytic_data, aes(x = BMXBMI)) +
  geom_histogram(bins = 30) +
  theme_minimal() +
  labs(title = "BMI Distribution")
## Warning: Removed 11 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Calcium
ggplot(analytic_data, aes(x = calcium_total)) +
  geom_histogram(bins = 30) +
  theme_minimal() +
  labs(title = "Total Calcium Distribution")
## Warning: Removed 157 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Vitamin D
ggplot(analytic_data, aes(x = vitd_total)) +
  geom_histogram(bins = 30) +
  theme_minimal() +
  labs(title = "Total Vitamin D Distribution")
## Warning: Removed 157 rows containing non-finite outside the scale range
## (`stat_bin()`).

# MET
ggplot(analytic_data, aes(x = totmet)) +
  geom_histogram(bins = 30) +
  theme_minimal() +
  labs(title = "MET Distribution")
## Warning: Removed 698 rows containing non-finite outside the scale range
## (`stat_bin()`).

#Step 3: Three Correlation Tables

library(broom)
analytic_data <- analytic_data %>%
  mutate(
    RIDAGEYR = as.numeric(as.character(RIDAGEYR)),
    BMXBMI  = as.numeric(as.character(BMXBMI)),
    calcium_total = as.numeric(as.character(calcium_total)),
    vitd_total = as.numeric(as.character(vitd_total)),
    totmet = as.numeric(as.character(totmet))
  )
analytic_data <- analytic_data %>%
  mutate(
    log_BMI    = log(BMXBMI),
    log_calcium = log(calcium_total + 1),
    sqrt_vitd   = sqrt(vitd_total),
    sqrt_totmet    = sqrt(totmet)
  )

corr_pair <- function(data, var1, var2) {
  
  test <- cor.test(data[[var1]], data[[var2]], method = "pearson")
  
  tibble(
    Variable_1 = var1,
    Variable_2 = var2,
    r = test$estimate,
    p_value = test$p.value,
    n = sum(complete.cases(data[[var1]], data[[var2]]))
  )
}
#Table A
tableA <- bind_rows(
  corr_pair(analytic_data, "RIDAGEYR", "BMXBMI"),
  corr_pair(analytic_data, "log_BMI", "BMXBMI"),
  corr_pair(analytic_data, "log_calcium", "BMXBMI"),
  corr_pair(analytic_data, "sqrt_vitd", "BMXBMI")
)
tableA
## # A tibble: 4 × 5
##   Variable_1  Variable_2        r    p_value     n
##   <chr>       <chr>         <dbl>      <dbl> <int>
## 1 RIDAGEYR    BMXBMI     -0.0981  0.00000274  2275
## 2 log_BMI     BMXBMI      0.989   0           2275
## 3 log_calcium BMXBMI     -0.00214 0.921       2122
## 4 sqrt_vitd   BMXBMI      0.0285  0.189       2122
#Table B
tableB <- bind_rows(
  corr_pair(analytic_data, "RIDAGEYR", "sqrt_totmet"),
  corr_pair(analytic_data, "log_BMI", "sqrt_totmet"),
  corr_pair(analytic_data, "log_calcium", "sqrt_totmet"),
  corr_pair(analytic_data, "sqrt_vitd", "sqrt_totmet")
)
tableB
## # A tibble: 4 × 5
##   Variable_1  Variable_2         r   p_value     n
##   <chr>       <chr>          <dbl>     <dbl> <int>
## 1 RIDAGEYR    sqrt_totmet -0.105   0.0000267  1588
## 2 log_BMI     sqrt_totmet  0.00863 0.732      1582
## 3 log_calcium sqrt_totmet  0.0677  0.00875    1500
## 4 sqrt_vitd   sqrt_totmet -0.00349 0.893      1500
#Table C
vars <- c("RIDAGEYR", "log_BMI", "log_calcium", "sqrt_vitd")

pairs <- combn(vars, 2, simplify = FALSE)

tableC <- purrr::map_dfr(pairs, ~ corr_pair(analytic_data, .x[1], .x[2]))

tableC
## # A tibble: 6 × 5
##   Variable_1  Variable_2         r  p_value     n
##   <chr>       <chr>          <dbl>    <dbl> <int>
## 1 RIDAGEYR    log_BMI     -0.0928  9.28e- 6  2275
## 2 RIDAGEYR    log_calcium  0.0666  2.10e- 3  2129
## 3 RIDAGEYR    sqrt_vitd    0.152   2.02e-12  2129
## 4 log_BMI     log_calcium  0.00878 6.86e- 1  2122
## 5 log_BMI     sqrt_vitd    0.0263  2.26e- 1  2122
## 6 log_calcium sqrt_vitd    0.291   6.21e-43  2129

#Part 3: Reflection (10 points) Write 200–400 words addressing: A. Comparing Methods (3 points) - When to use ANOVA vs. correlation? - Key differences in what they tell us - Example research questions for each Ans: ANOVA is used when comparing the mean of a continuous outcome across two or more categorical groups. In this assignment, ANOVA was appropriate for examining whether mean BMD differs across ethnicity groups, since ethnicity is categorical and BMD is continuous.Correlation is used to examine the strength and direction of association between two continuous variables. In this assignment, correlation was appropriate for assessing relationships between variables such as age, BMI, nutrient intake, physical activity, and BMD.

B. Assumptions & Limitations (4 points) - Which assumptions were challenging? - How might violations affect conclusions? - Limitations of cross-sectional correlation for causal inference Ans: The most challenging assumptions were: Normality of residuals in ANOVA Homogeneity of variance across ethnicity groups Normality and linearity for correlation Some variables showed skewed distributions, requiring log and square-root transformations.

C. R Programming Challenge (3 points) - What was most difficult? - How did you solve it? - What skill did you strengthen?

Ans : The most difficult aspect of the R analysis was managing variable types and transformations. Several variables were initially stored as factors rather than numeric, which caused errors during correlation analysis (e.g., “x must be a numeric vector”). To solve this googled it and got help from stack overflow.