#Part 0: Data Preparation (10 points)
# Import bmd.csv
data <- read.csv("C:\\Users\\safwa\\OneDrive - University at Albany - SUNY\\EPI 553\\Assignment\\bmd.csv")
# Load required packages
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'tibble' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'dplyr' was built under R version 4.5.2
## Warning: package 'stringr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(knitr)
## Warning: package 'knitr' was built under R version 4.5.2
library(broom) # For clean statistical output
library(car)
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.5.2
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.5.2
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
#Check Column names
names(data)
## [1] "SEQN" "RIAGENDR" "RIDAGEYR" "RIDRETH1" "BMXBMI" "smoker"
## [7] "totmet" "metcat" "DXXOFBMD" "tbmdcat" "calcium" "vitd"
## [13] "DSQTVD" "DSQTCALC"
#Convert RIDRETH1 to factor with meaningful ethnicity labels
data <- data %>%
mutate(
RIDRETH1 = factor(
RIDRETH1,
levels = c(1, 2, 3, 4, 5),
labels = c("Mexican American",
"Other Hispanic",
"Non-Hispanic White",
"Non-Hispanic Black",
"Other/Multi-Racial")
)
)
#Convert RIAGENDR to factor (Male/Female)
data <- data %>%
mutate(
RIAGENDR = factor(
RIAGENDR,
levels = c(1, 2),
labels = c("Male", "Female")
)
)
data <- data %>%
mutate(
smoker = factor(
smoker,
levels = c(1, 2, 3),
labels = c("Current", "Past", "Never")
)
)
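As an optional sanity check (a minimal sketch using only the objects defined above), the recoded factors can be tabulated to confirm the labels were applied as intended:
# Optional check: confirm each factor was recoded with the expected labels
table(data$RIDRETH1, useNA = "ifany")
table(data$RIAGENDR, useNA = "ifany")
table(data$smoker, useNA = "ifany")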
# Total sample size
n_total <- nrow(data)
n_total
## [1] 2898
analytic_data <- data %>%
drop_na(DXXOFBMD, RIDRETH1, RIAGENDR, smoker)
#Report final analytic sample size
n_analytic <- nrow(analytic_data)
cat("Final analytic sample size:", n_analytic)
## Final analytic sample size: 2286
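The number of participants excluded because of missing data can also be reported; a minimal sketch using the objects defined above:
# Participants excluded due to missing BMD, ethnicity, sex, or smoking data
n_excluded <- n_total - n_analytic
n_excluded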
#Part 1: One-Way ANOVA (50 points)
Research Question: Do mean BMD values differ across ethnicity groups?
H₀: μ₁ = μ₂ = μ₃ = μ₄ = μ₅
Hₐ: At least one group mean differs
Where:
μ₁ = Mean BMD (Mexican American)
μ₂ = Mean BMD (Other Hispanic)
μ₃ = Mean BMD (Non-Hispanic White)
μ₄ = Mean BMD (Non-Hispanic Black)
μ₅ = Mean BMD (Other/Multi-Racial)
Null hypothesis (H₀): Average bone mineral density is the same across all ethnicity groups.
Alternative hypothesis (Hₐ): At least one ethnicity group has a different average bone mineral density.
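Before the exploratory summary, it can help to confirm that all five ethnicity groups appear in the analytic sample; an optional sketch (the counts should match the n column in the summary table below):
# Group sizes in the analytic sample
table(analytic_data$RIDRETH1)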
#Step 2: Exploratory Analysis (10 points)
library(dplyr)
library(kableExtra)
summary_table <- analytic_data %>%
group_by(RIDRETH1) %>%
summarize(
n = n(),
mean_BMD = mean(DXXOFBMD, na.rm = TRUE),
sd_BMD = sd(DXXOFBMD, na.rm = TRUE)
)
summary_table %>%
kable(digits = 3, caption = "BMD by Ethnicity") %>%
kable_styling(full_width = FALSE)
| RIDRETH1 | n | mean_BMD | sd_BMD |
|---|---|---|---|
| Mexican American | 255 | 0.950 | 0.149 |
| Other Hispanic | 244 | 0.946 | 0.156 |
| Non-Hispanic White | 846 | 0.888 | 0.160 |
| Non-Hispanic Black | 532 | 0.973 | 0.161 |
| Other/Multi-Racial | 409 | 0.897 | 0.148 |
library(ggplot2)
ggplot(analytic_data, aes(x = RIDRETH1, y = DXXOFBMD)) +
geom_boxplot() +
geom_jitter(width = 0.2, alpha = 0.4) +
labs(
title = "Distribution of BMD by Ethnicity",
x = "Ethnicity",
y = "Bone Mineral Density (BMD)"
) +
theme_minimal()
#Step 3: ANOVA F-test (10 points)
anova_model <- aov(DXXOFBMD ~ RIDRETH1, data = analytic_data)
summary(anova_model)
## Df Sum Sq Mean Sq F value Pr(>F)
## RIDRETH1 4 2.95 0.7371 30.18 <2e-16 ***
## Residuals 2281 55.70 0.0244
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(broom)
anova_results <- tidy(anova_model)
anova_results %>%
kable(digits = 4, caption = "ANOVA Results") %>%
kable_styling(full_width = FALSE)
| term | df | sumsq | meansq | statistic | p.value |
|---|---|---|---|---|---|
| RIDRETH1 | 4 | 2.9482 | 0.7371 | 30.185 | 0 |
| Residuals | 2281 | 55.6973 | 0.0244 | NA | NA |
#Report F-statistic, df, p-value in formatted table
#State decision (reject/fail to reject H₀) and interpret
The ANOVA gave F(4, 2281) = 30.19, p < 0.001. Since p < 0.05, we reject the null hypothesis: mean BMD differs significantly across ethnicity groups, with at least one group mean different from the others.
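As a sketch, the reported values can also be pulled directly from the tidy ANOVA output rather than transcribed by hand:
# Extract the F-statistic, degrees of freedom, and p-value from the tidy ANOVA table
f_stat <- anova_results$statistic[1]
df_between <- anova_results$df[1]
df_within <- anova_results$df[2]
p_val <- anova_results$p.value[1]
cat("F(", df_between, ", ", df_within, ") = ", round(f_stat, 2),
    ", p ", ifelse(p_val < 0.001, "< 0.001", paste0("= ", round(p_val, 3))), sep = "")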
#Step 4: Assumption Checks (10 points)
#A. Normality (Q-Q Plot)
qqnorm(residuals(anova_model))
qqline(residuals(anova_model))
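If a formal test is desired to supplement the Q-Q plot, a Shapiro-Wilk test on the residuals is one option (a sketch; note that with a sample this large the test flags even minor departures from normality, so the Q-Q plot remains the primary check):
# Optional supplement to the Q-Q plot (n = 2286 is within shapiro.test's 5000-observation limit)
shapiro.test(residuals(anova_model))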
#B. Equal Variances (Levene’s Test)
library(car)
leveneTest(DXXOFBMD ~ RIDRETH1, data = analytic_data)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 4 1.5728 0.1788
## 2281
Levene's test is not significant (F(4, 2281) = 1.57, p = 0.179), so we fail to reject the null hypothesis of equal variances; the homogeneity-of-variance assumption appears to be met.
#Step 5: Post-hoc Comparisons (10 points)
#Conduct Tukey HSD test
tukey_results <- TukeyHSD(anova_model)
tukey_results
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = DXXOFBMD ~ RIDRETH1, data = analytic_data)
##
## $RIDRETH1
## diff lwr upr
## Other Hispanic-Mexican American -0.004441273 -0.042644130 0.03376158
## Non-Hispanic White-Mexican American -0.062377249 -0.092852618 -0.03190188
## Non-Hispanic Black-Mexican American 0.022391619 -0.010100052 0.05488329
## Other/Multi-Racial-Mexican American -0.053319344 -0.087357253 -0.01928144
## Non-Hispanic White-Other Hispanic -0.057935976 -0.088934694 -0.02693726
## Non-Hispanic Black-Other Hispanic 0.026832892 -0.006150151 0.05981593
## Other/Multi-Racial-Other Hispanic -0.048878071 -0.083385341 -0.01437080
## Non-Hispanic Black-Non-Hispanic White 0.084768868 0.061164402 0.10837333
## Other/Multi-Racial-Non-Hispanic White 0.009057905 -0.016633367 0.03474918
## Other/Multi-Racial-Non-Hispanic Black -0.075710963 -0.103764519 -0.04765741
## p adj
## Other Hispanic-Mexican American 0.9978049
## Non-Hispanic White-Mexican American 0.0000003
## Non-Hispanic Black-Mexican American 0.3276613
## Other/Multi-Racial-Mexican American 0.0001919
## Non-Hispanic White-Other Hispanic 0.0000036
## Non-Hispanic Black-Other Hispanic 0.1722466
## Other/Multi-Racial-Other Hispanic 0.0010720
## Non-Hispanic Black-Non-Hispanic White 0.0000000
## Other/Multi-Racial-Non-Hispanic White 0.8719109
## Other/Multi-Racial-Non-Hispanic Black 0.0000000
#Create formatted table showing:
tukey_df <- as.data.frame(tukey_results$RIDRETH1)
tukey_df$Comparison <- rownames(tukey_df)
tukey_df %>%
  select(Comparison, diff, lwr, upr, `p adj`) %>%
  kable(digits = 4, row.names = FALSE, caption = "Tukey Post-Hoc Comparisons") %>%
  kable_styling(full_width = FALSE)
| Comparison | diff | lwr | upr | p adj |
|---|---|---|---|---|
| Other Hispanic-Mexican American | -0.0044 | -0.0426 | 0.0338 | 0.9978 |
| Non-Hispanic White-Mexican American | -0.0624 | -0.0929 | -0.0319 | 0.0000 |
| Non-Hispanic Black-Mexican American | 0.0224 | -0.0101 | 0.0549 | 0.3277 |
| Other/Multi-Racial-Mexican American | -0.0533 | -0.0874 | -0.0193 | 0.0002 |
| Non-Hispanic White-Other Hispanic | -0.0579 | -0.0889 | -0.0269 | 0.0000 |
| Non-Hispanic Black-Other Hispanic | 0.0268 | -0.0062 | 0.0598 | 0.1722 |
| Other/Multi-Racial-Other Hispanic | -0.0489 | -0.0834 | -0.0144 | 0.0011 |
| Non-Hispanic Black-Non-Hispanic White | 0.0848 | 0.0612 | 0.1084 | 0.0000 |
| Other/Multi-Racial-Non-Hispanic White | 0.0091 | -0.0166 | 0.0347 | 0.8719 |
| Other/Multi-Racial-Non-Hispanic Black | -0.0757 | -0.1038 | -0.0477 | 0.0000 |
#Interpret which groups significantly differ
Based on the adjusted p-values, the Non-Hispanic White group has significantly lower mean BMD than the Mexican American, Other Hispanic, and Non-Hispanic Black groups, and the Other/Multi-Racial group has significantly lower mean BMD than the Mexican American, Other Hispanic, and Non-Hispanic Black groups. The remaining pairs (Other Hispanic vs. Mexican American, Non-Hispanic Black vs. Mexican American, Non-Hispanic Black vs. Other Hispanic, and Other/Multi-Racial vs. Non-Hispanic White) do not differ significantly.
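A plot of the Tukey intervals can make these contrasts easier to scan; a minimal sketch using base R's plot method for TukeyHSD objects:
# Family-wise 95% confidence intervals; intervals that exclude 0 indicate significant pairs
plot(tukey_results, las = 1)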
#Calculate eta-squared: eta-squared = SS_between / SS_total
anova_summary <- summary(anova_model)[[1]]
SS_between <- anova_summary["RIDRETH1", "Sum Sq"]
SS_total <- sum(anova_summary[, "Sum Sq"])
eta_sq <- SS_between / SS_total
eta_sq
## [1] 0.05027196
#Interpret using Cohen's guidelines (small approximately 0.01, medium approximately 0.06, large approximately 0.14)
Eta-squared = 0.050, which falls between the small (0.01) and medium (0.06) benchmarks and is close to the medium threshold: ethnicity explains about 5% of the variance in BMD, a small-to-medium effect.
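As a cross-check, the same quantity can be computed from the tidy ANOVA table created earlier (a minimal sketch):
# Eta-squared recomputed from the broom output: SS_between / SS_total
anova_results$sumsq[1] / sum(anova_results$sumsq)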
#Part 2: Correlation Analysis (40 points)
library(dplyr)
#Step 1: Create Total Intake Variables
analytic_data <- analytic_data %>%
  mutate(
    # Replace missing supplement values with 0, then combine with the base intake variables
    DSQTCALC = ifelse(is.na(DSQTCALC), 0, DSQTCALC),
    DSQTVD = ifelse(is.na(DSQTVD), 0, DSQTVD),
    calcium_total = calcium + DSQTCALC,
    vitd_total = vitd + DSQTVD
  )
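A brief optional check that the zero-imputation behaved as expected: any remaining missingness in the composite totals must come from the base calcium and vitd variables, since the supplement variables were set to 0.
# Remaining NAs in the intake variables and their composites
colSums(is.na(analytic_data[, c("calcium", "vitd", "calcium_total", "vitd_total")]))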
#Step 2: Assess and Apply Transformations (5 points)
library(ggplot2)
# BMI
ggplot(analytic_data, aes(x = BMXBMI)) +
geom_histogram(bins = 30) +
theme_minimal() +
labs(title = "BMI Distribution")
## Warning: Removed 11 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Calcium
ggplot(analytic_data, aes(x = calcium_total)) +
geom_histogram(bins = 30) +
theme_minimal() +
labs(title = "Total Calcium Distribution")
## Warning: Removed 157 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Vitamin D
ggplot(analytic_data, aes(x = vitd_total)) +
geom_histogram(bins = 30) +
theme_minimal() +
labs(title = "Total Vitamin D Distribution")
## Warning: Removed 157 rows containing non-finite outside the scale range
## (`stat_bin()`).
# MET
ggplot(analytic_data, aes(x = totmet)) +
geom_histogram(bins = 30) +
theme_minimal() +
labs(title = "MET Distribution")
## Warning: Removed 698 rows containing non-finite outside the scale range
## (`stat_bin()`).
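To confirm visually that the transformations applied in the next step reduce the skew, the histograms can be re-drawn on the transformed scale; a minimal sketch for BMI (the other variables follow the same pattern):
# Histogram of log-transformed BMI for comparison with the raw-scale plot above
ggplot(analytic_data, aes(x = log(BMXBMI))) +
  geom_histogram(bins = 30) +
  theme_minimal() +
  labs(title = "log(BMI) Distribution")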
#Step 3: Three Correlation Tables
library(broom)
#Ensure the correlation variables are numeric (some were imported as factors)
analytic_data <- analytic_data %>%
  mutate(
    RIDAGEYR = as.numeric(as.character(RIDAGEYR)),
    BMXBMI = as.numeric(as.character(BMXBMI)),
    calcium_total = as.numeric(as.character(calcium_total)),
    vitd_total = as.numeric(as.character(vitd_total)),
    totmet = as.numeric(as.character(totmet))
  )
analytic_data <- analytic_data %>%
  mutate(
    log_BMI = log(BMXBMI),                # log transform for skewed BMI
    log_calcium = log(calcium_total + 1), # +1 avoids log(0) for zero intake
    sqrt_vitd = sqrt(vitd_total),         # square-root transform for skewed vitamin D
    sqrt_totmet = sqrt(totmet)            # square-root transform for skewed MET totals
  )
# Helper: Pearson correlation between two variables, returned as a one-row tibble
# (cor.test drops incomplete pairs, and n reports the pairwise-complete count)
corr_pair <- function(data, var1, var2) {
  test <- cor.test(data[[var1]], data[[var2]], method = "pearson")
  tibble(
    Variable_1 = var1,
    Variable_2 = var2,
    r = test$estimate,
    p_value = test$p.value,
    n = sum(complete.cases(data[[var1]], data[[var2]]))
  )
}
#Table A
tableA <- bind_rows(
corr_pair(analytic_data, "RIDAGEYR", "BMXBMI"),
corr_pair(analytic_data, "log_BMI", "BMXBMI"),
corr_pair(analytic_data, "log_calcium", "BMXBMI"),
corr_pair(analytic_data, "sqrt_vitd", "BMXBMI")
)
tableA
## # A tibble: 4 × 5
## Variable_1 Variable_2 r p_value n
## <chr> <chr> <dbl> <dbl> <int>
## 1 RIDAGEYR BMXBMI -0.0981 0.00000274 2275
## 2 log_BMI BMXBMI 0.989 0 2275
## 3 log_calcium BMXBMI -0.00214 0.921 2122
## 4 sqrt_vitd BMXBMI 0.0285 0.189 2122
#Table B
tableB <- bind_rows(
corr_pair(analytic_data, "RIDAGEYR", "sqrt_totmet"),
corr_pair(analytic_data, "log_BMI", "sqrt_totmet"),
corr_pair(analytic_data, "log_calcium", "sqrt_totmet"),
corr_pair(analytic_data, "sqrt_vitd", "sqrt_totmet")
)
tableB
## # A tibble: 4 × 5
## Variable_1 Variable_2 r p_value n
## <chr> <chr> <dbl> <dbl> <int>
## 1 RIDAGEYR sqrt_totmet -0.105 0.0000267 1588
## 2 log_BMI sqrt_totmet 0.00863 0.732 1582
## 3 log_calcium sqrt_totmet 0.0677 0.00875 1500
## 4 sqrt_vitd sqrt_totmet -0.00349 0.893 1500
#Table C
vars <- c("RIDAGEYR", "log_BMI", "log_calcium", "sqrt_vitd")
pairs <- combn(vars, 2, simplify = FALSE)
tableC <- purrr::map_dfr(pairs, ~ corr_pair(analytic_data, .x[1], .x[2]))
tableC
## # A tibble: 6 × 5
## Variable_1 Variable_2 r p_value n
## <chr> <chr> <dbl> <dbl> <int>
## 1 RIDAGEYR log_BMI -0.0928 9.28e- 6 2275
## 2 RIDAGEYR log_calcium 0.0666 2.10e- 3 2129
## 3 RIDAGEYR sqrt_vitd 0.152 2.02e-12 2129
## 4 log_BMI log_calcium 0.00878 6.86e- 1 2122
## 5 log_BMI sqrt_vitd 0.0263 2.26e- 1 2122
## 6 log_calcium sqrt_vitd 0.291 6.21e-43 2129
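For consistency with the earlier tables, the three correlation tables could also be rendered with kable; a sketch for Table A (Tables B and C follow the same pattern):
# Optional: format the correlation results to match the earlier kable tables
tableA %>%
  kable(digits = 3, caption = "Table A: Correlations with BMI (BMXBMI)") %>%
  kable_styling(full_width = FALSE)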
#Part 3: Reflection (10 points) Write 200–400 words addressing:
A. Comparing Methods (3 points)
- When to use ANOVA vs. correlation?
- Key differences in what they tell us
- Example research questions for each
Ans: ANOVA is used to compare the mean of a continuous outcome across two or more categorical groups. In this assignment, ANOVA was appropriate for examining whether mean BMD differs across ethnicity groups, since ethnicity is categorical and BMD is continuous. Correlation is used to examine the strength and direction of a linear association between two continuous variables. In this assignment, correlation was appropriate for assessing relationships among continuous variables such as age, BMI, nutrient intake, and physical activity. Example research questions: "Does mean BMD differ by smoking status?" (ANOVA) versus "Is older age associated with lower BMI?" (correlation).
B. Assumptions & Limitations (4 points)
- Which assumptions were challenging?
- How might violations affect conclusions?
- Limitations of cross-sectional correlation for causal inference
Ans: The most challenging assumptions were normality of residuals for the ANOVA, homogeneity of variance across ethnicity groups, and normality and linearity for the correlations. Several variables showed skewed distributions, requiring log and square-root transformations. If these assumptions are violated, the reported p-values and confidence intervals may be inaccurate, which could lead to incorrect conclusions about group differences or associations. In addition, because the data are cross-sectional, a significant correlation indicates association only; it cannot establish temporal ordering or rule out confounding, so no causal inference can be drawn.
C. R Programming Challenge (3 points)
- What was most difficult?
- How did you solve it?
- What skill did you strengthen?
Ans: The most difficult aspect of the R analysis was managing variable types and transformations. Several variables were initially stored as factors rather than numeric, which caused errors during the correlation analysis (e.g., "x must be a numeric vector"). To solve this, I searched the error message and followed guidance from Stack Overflow, converting the variables with as.numeric(as.character()). This strengthened my skills in checking and converting variable types before analysis.