BMD <- read.csv("data/bmd.csv")

PART 0 Prepare the dataset

#Select key variables and rename
BMD_clean <- BMD %>%
  select(SEQN, DXXOFBMD, RIDRETH1, RIAGENDR, RIDAGEYR, BMXBMI, smoker, calcium, DSQTCALC, vitd, DSQTVD, totmet) %>%
  rename(BMD = DXXOFBMD,
          Race = RIDRETH1,
          Sex = RIAGENDR,
          Age = RIDAGEYR,
          BMI = BMXBMI,
          supp_calcium = DSQTCALC,
          diet_vitamin_D = vitd,
          supp_vitamin_D = DSQTVD,
          Physical_activity = totmet)

BMD_clean_2 <- BMD_clean %>%
  mutate(Race = factor(Race, levels=c(1,2,3,4,5), 
                labels=c("Mexican American", "Other Hispanic", "Non-Hispanic, White", "Non-Hispanic Black", "Other"))) %>%
  mutate(Sex = factor(Sex, levels=c(1,2), labels=c("Male", "Female"))) %>%
  mutate(smoker = factor(smoker, levels=c(1,2,3), labels=c("Current", "Past", "Never")))

#Display N before removing missing
print(nrow(BMD_clean_2)) #2898

## [1] 2898

#Percent missing report for BMD and race
mean(is.na(BMD_clean_2$BMD)) * 100 #21%

## [1] 21.11801

mean(is.na(BMD_clean_2$Race)) * 100 #0%

## [1] 0

#Remove missing values from race and BMD
data_1 <- BMD_clean_2 %>%
  filter(!is.na(BMD))

#Display N after removing missing
print(nrow(data_1)) #2286

## [1] 2286

PART 1 One-Way ANOVA

#Research Question: Do mean BMD values differ across ethnicity groups?

#Step 1 - Hypotheses #Null Hypothesis (H₀): μ_1 = μ_2 = μ_3 = μ_4 = μ_5 - All five ethnicity groups BMD means are equal) #Alternative Hypothesis (H₁): At least one ethnicity BMD mean differs from the others

#Step 2 - Exploratory Analysis
# Create summary table of BMD by ethnicity
BMD_by_ethnicity <- data_1 %>%
  group_by(Race) %>%
  summarise(
      N = n(),
      Mean_BMD = round(mean(BMD, na.rm = TRUE), 2),
      SD_BMD = sd(BMD, na.rm = TRUE),
      Lower_BMD = (Mean_BMD - SD_BMD),
      Upper_BMD = (Mean_BMD + SD_BMD))

summary_table <- BMD_by_ethnicity %>%
  select(
    Race,
    N,
    Mean_BMD,
    Lower_BMD,
    Upper_BMD)

kable(summary_table, 
      caption = "Bone Marrow Density by Ethnicity",
      format = "html") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE)

Bone Marrow Density by Ethnicity
Race	N	Mean_BMD	Lower_BMD	Upper_BMD
Mexican American	255	0.95	0.8011257	1.098874
Other Hispanic	244	0.95	0.7944264	1.105574
Non-Hispanic, White	846	0.89	0.7302357	1.049764
Non-Hispanic Black	532	0.97	0.8091685	1.130831
Other	409	0.90	0.7524478	1.047552

#Create plot of BMD by ethnicity
ggplot(data=data_1, aes(x= Race, y=BMD)) +
  geom_boxplot(fill = "white", colour = "darkblue") +
  ylab("Bone Marrow Density") +
  ggtitle("Box plot of Bone Marrow Density by Ethnicity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

#Step 3 - ANOVA F-test
# Outcome: BMD
# Predictor: Race

# Fit the one-way ANOVA model
anova_model <- aov(BMD ~ Race, data = data_1)

# Display the ANOVA table
summary(anova_model)

##               Df Sum Sq Mean Sq F value Pr(>F)    
## Race           4   2.95  0.7371   30.18 <2e-16 ***
## Residuals   2281  55.70  0.0244                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova_table <- summary(anova_model)
print(anova_table)

##               Df Sum Sq Mean Sq F value Pr(>F)    
## Race           4   2.95  0.7371   30.18 <2e-16 ***
## Residuals   2281  55.70  0.0244                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova_df <- data.frame(
  Source = c("Ethnicity", "Residuals"),
  DF = c(anova_table[[1]]$Df[1], anova_table[[1]]$Df[2]),
  `Sum Sq` = c(anova_table[[1]]$`Sum Sq`[1], anova_table[[1]]$`Sum Sq`[2]),
  `Mean Sq` = c(anova_table[[1]]$`Mean Sq`[1], anova_table[[1]]$`Mean Sq`[2]),
  `F value` = c(anova_table[[1]]$`F value`[1], NA),
  `Pr(>F)` = c(anova_table[[1]]$`Pr(>F)`[1], NA)
)

kable(anova_df, digits = 4,
      caption = "Bone Marrow Density by Ethnicity",
      col.names = c("Source", "DF", "Sum Sq", "Mean Sq", "F value", "p-value"),
      format = "html") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE)

Bone Marrow Density by Ethnicity
Source	DF	Sum Sq	Mean Sq	F value	p-value
Ethnicity	4	2.9482	0.7371	30.185	0
Residuals	2281	55.6973	0.0244	NA	NA

#F-statistic: 30.2 #Degrees of freedom: 4 #p-value: <2e-16 (very small) #Decision (reject or fail to reject H₀): Since p < 0.05, we reject H₀ #Statistical conclusion in words: There is statistically significant evidence that bone marrow density differs across at least ethnicities

# Step 4 - Assumption Checks 
#A. Normality: - Create Q-Q plot - Interpret whether residuals appear normally distributed
par(mfrow = c(2, 2))
plot(anova_model, which = 1:4)

#The values appear to fall closely on the horizontal line, suggesting a normal distribution.

#B. Equal Variances: - Conduct Levene’s test - Report test statistic and p-value - State conclusion about homogeneity of variance
levene_test <- leveneTest(BMD ~ Race, data = data_1)
print(levene_test)

## Levene's Test for Homogeneity of Variance (center = median)
##         Df F value Pr(>F)
## group    4  1.5728 0.1788
##       2281

#Fail to reject equal variances (p > 0.05). A non-significant result (p ≥ 0.05) for Levene’s test indicates homogeneous variances across ethnic groups, supporting the equal variance assumption underlying ANOVA.

# Step 5 - Post-hoc Comparisons
#Conduct Tukey HSD test
tukey_results <- TukeyHSD(anova_model)
print(tukey_results)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = BMD ~ Race, data = data_1)
## 
## $Race
##                                                diff          lwr         upr
## Other Hispanic-Mexican American        -0.004441273 -0.042644130  0.03376158
## Non-Hispanic, White-Mexican American   -0.062377249 -0.092852618 -0.03190188
## Non-Hispanic Black-Mexican American     0.022391619 -0.010100052  0.05488329
## Other-Mexican American                 -0.053319344 -0.087357253 -0.01928144
## Non-Hispanic, White-Other Hispanic     -0.057935976 -0.088934694 -0.02693726
## Non-Hispanic Black-Other Hispanic       0.026832892 -0.006150151  0.05981593
## Other-Other Hispanic                   -0.048878071 -0.083385341 -0.01437080
## Non-Hispanic Black-Non-Hispanic, White  0.084768868  0.061164402  0.10837333
## Other-Non-Hispanic, White               0.009057905 -0.016633367  0.03474918
## Other-Non-Hispanic Black               -0.075710963 -0.103764519 -0.04765741
##                                            p adj
## Other Hispanic-Mexican American        0.9978049
## Non-Hispanic, White-Mexican American   0.0000003
## Non-Hispanic Black-Mexican American    0.3276613
## Other-Mexican American                 0.0001919
## Non-Hispanic, White-Other Hispanic     0.0000036
## Non-Hispanic Black-Other Hispanic      0.1722466
## Other-Other Hispanic                   0.0010720
## Non-Hispanic Black-Non-Hispanic, White 0.0000000
## Other-Non-Hispanic, White              0.8719109
## Other-Non-Hispanic Black               0.0000000

#Create formatted table
tukey_summary <- as.data.frame(tukey_results$Race)
tukey_summary$Comparison <- rownames(tukey_summary)
tukey_summary <- tukey_summary[, c("Comparison", "diff", "lwr", "upr", "p adj")]

# Add interpretation columns
tukey_summary$Significant <- ifelse(tukey_summary$`p adj` < 0.05, "Yes", "No")
tukey_summary$Direction <- ifelse(
  tukey_summary$Significant == "Yes",
  ifelse(tukey_summary$diff > 0, "First group higher", "Second group higher"),
  "No difference"
)

kable(tukey_summary, digits = 4,
      caption = "Tukey HSD post-hoc comparisons with 95% confidence intervals",
      format = "html") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE)

Tukey HSD post-hoc comparisons with 95% confidence intervals
	Comparison	diff	lwr	upr	p adj	Significant	Direction
Other Hispanic-Mexican American	Other Hispanic-Mexican American	-0.0044	-0.0426	0.0338	0.9978	No	No difference
Non-Hispanic, White-Mexican American	Non-Hispanic, White-Mexican American	-0.0624	-0.0929	-0.0319	0.0000	Yes	Second group higher
Non-Hispanic Black-Mexican American	Non-Hispanic Black-Mexican American	0.0224	-0.0101	0.0549	0.3277	No	No difference
Other-Mexican American	Other-Mexican American	-0.0533	-0.0874	-0.0193	0.0002	Yes	Second group higher
Non-Hispanic, White-Other Hispanic	Non-Hispanic, White-Other Hispanic	-0.0579	-0.0889	-0.0269	0.0000	Yes	Second group higher
Non-Hispanic Black-Other Hispanic	Non-Hispanic Black-Other Hispanic	0.0268	-0.0062	0.0598	0.1722	No	No difference
Other-Other Hispanic	Other-Other Hispanic	-0.0489	-0.0834	-0.0144	0.0011	Yes	Second group higher
Non-Hispanic Black-Non-Hispanic, White	Non-Hispanic Black-Non-Hispanic, White	0.0848	0.0612	0.1084	0.0000	Yes	First group higher
Other-Non-Hispanic, White	Other-Non-Hispanic, White	0.0091	-0.0166	0.0347	0.8719	No	No difference
Other-Non-Hispanic Black	Other-Non-Hispanic Black	-0.0757	-0.1038	-0.0477	0.0000	Yes	Second group higher

#Non-Hispanic, White and Mexican American ethnic groups significantly differ, with Mexican American having higher BMD #Other and Mexican American ethnic groups significantly differ, with Mexican American having higher BMD #Non-Hispanic, White and Other Hispanic ethnic groups significantly differ, with Other Hispanic having higher BMD #Other and Other Hispanic ethnic groups significantly differ, with Other having higher BMD #Non-Hispanic, Black and Non Hispanic, White ethnic groups significantly differ, with Non-Hispanic, Black having higher BMD #Other and Non-Hispanic Black ethnic groups significantly differ, with Non-Hispanic, Black having higher BMD

# Step 6 - Effect Size
# Calculate eta-squared: eta-squared = SS_between / SS_total
ss_treatment <- anova_table[[1]]$`Sum Sq`[1]
ss_total <- sum(anova_table[[1]]$`Sum Sq`)
eta_squared <- ss_treatment / ss_total

#Interpret using Cohen’s guidelines #The η² = 0.0503 (about 5%). This is a small effect size, meaning ethnicity explains only 5% of BMD variance. While the F-test suggests significance, the magnitude of the association is modest with other unmeasured factors explaining the remaining 95%

PART 2 Correlation Analysis

#Step 1 - Create new calculated variables
data_2 <- data_1 %>%
  mutate(calcium = ifelse(is.na(calcium), 0, calcium)) %>%
  mutate(supp_calcium = ifelse(is.na(supp_calcium), 0, supp_calcium)) %>%
  mutate(diet_vitamin_D = ifelse(is.na(diet_vitamin_D), 0, diet_vitamin_D)) %>%
  mutate(supp_vitamin_D = ifelse(is.na(supp_vitamin_D), 0, supp_vitamin_D)) 
         
data_2$calcium_total <- (data_2$calcium + data_2$supp_calcium)
data_2$vitamin_d_total <- (data_2$diet_vitamin_D + data_2$supp_vitamin_D)

#Step 2 - Apply transformations

#Create histograms for continuous variables (BMI, calcium, vitamin D, MET)
#Decide on transformations based on skewness
#Document reasoning

ggplot(data=data_2, aes(BMI)) +
  geom_histogram(bins=10)

## Warning: Removed 11 rows containing non-finite outside the scale range
## (`stat_bin()`).

ggplot(data=data_2, aes(calcium_total)) +
  geom_histogram(bins=10)

ggplot(data=data_2, aes(vitamin_d_total)) +
  geom_histogram(bins=10)

ggplot(data=data_2, aes(Physical_activity)) +
  geom_histogram(bins=10)

## Warning: Removed 698 rows containing non-finite outside the scale range
## (`stat_bin()`).

#Transformations: #Log transformation: BMI, calcium (reduce right skewedness and normalize data) #Square root: MET, vitamin D (handles zeros better)

data_3 <- data_2
data_3$log_tot_calcium <- log(data_2$calcium_total)
data_3$log_BMI <- log(data_2$BMI)
data_3$sqrt_tot_vitamin_D <- sqrt(data_2$vitamin_d_total)
data_3$sqrt_Physical_activity <-sqrt(data_2$Physical_activity)

ggplot(data=data_3, aes(log_tot_calcium)) +
  geom_histogram(bins=10)

## Warning: Removed 93 rows containing non-finite outside the scale range
## (`stat_bin()`).

ggplot(data=data_3, aes(log_BMI)) +
  geom_histogram(bins=10)

## Warning: Removed 11 rows containing non-finite outside the scale range
## (`stat_bin()`).

ggplot(data=data_3, aes(sqrt_tot_vitamin_D)) +
  geom_histogram(bins=10)

ggplot(data=data_3, aes(sqrt_Physical_activity)) +
  geom_histogram(bins=10)

## Warning: Removed 698 rows containing non-finite outside the scale range
## (`stat_bin()`).

#Step 3: Correlation Tables
# Function to calculate Pearson correlation for a pair of variables
corr_pair <- function(data, x, y) {
# Select variables and remove missing values
  d <- data %>%
    select(all_of(c(x, y))) %>%
    drop_na()
  
# Need at least 3 observations for correlation
  if(nrow(d) < 3) {
    return(tibble(
      x = x,
      y = y,
      n = nrow(d),
      estimate = NA_real_,
      p_value = NA_real_
    ))
  }
  
# Calculate Pearson correlation
  ct <- cor.test(d[[x]], d[[y]], method = "pearson")
  
  # Return tidy results
  tibble(
    x = x,
    y = y,
    n = nrow(d),
    estimate = unname(ct$estimate),
    p_value = ct$p.value
  )}

#Use the provided corr_pair() helper function
#Each table must include: correlation coefficient (r), p-value, sample size (n)
#Table A: Predictors vs. Correlations between: - Age (RIDAGEYR) vs. BMD - BMI (log-transformed) vs. BMD - Total calcium
#(log-transformed) vs. BMD - Total vitamin D (sqrt-transformed) vs. BMD

tableA <- bind_rows(
  corr_pair(data_3, "Age", "BMD"),
  corr_pair(data_3, "log_BMI", "BMD"),
  corr_pair(data_3, "log_tot_calcium", "BMD"),
  corr_pair(data_3, "sqrt_tot_vitamin_D", "BMD")
) %>%
  mutate(
  estimate = round(estimate, 3),
  p_value = signif(p_value, 3)
)

tableA %>% kable()

x	y	n	estimate	p_value
Age	BMD	2286	-0.232	0.0000
log_BMI	BMD	2275	0.425	0.0000
log_tot_calcium	BMD	2286	NaN	NaN
sqrt_tot_vitamin_D	BMD	2286	-0.051	0.0151

#Age, BMI, and total vitamin D were significantly correlated with BMD. While BMI was moderately positively correlated with BMD, age and vitamin D had weak negative associations.

#Table B: Predictors vs. Exposure (Physical Activity) — 10 points #Correlations between: - Age vs. MET (sqrt-transformed) - BMI vs. MET - Total calcium vs. MET - Total vitamin D vs. MET

tableB <- bind_rows(
  corr_pair(data_3, "Age", "sqrt_Physical_activity"),
  corr_pair(data_3, "log_BMI", "sqrt_Physical_activity"),
  corr_pair(data_3, "log_tot_calcium", "sqrt_Physical_activity"),
  corr_pair(data_3, "sqrt_tot_vitamin_D", "sqrt_Physical_activity")
) %>%
  mutate(
    estimate = round(estimate, 3),
    p_value = signif(p_value, 3)
  )

tableB %>% kable()

x	y	n	estimate	p_value
Age	sqrt_Physical_activity	1588	-0.105	2.67e-05
log_BMI	sqrt_Physical_activity	1582	0.009	7.32e-01
log_tot_calcium	sqrt_Physical_activity	1588	NaN	NaN
sqrt_tot_vitamin_D	sqrt_Physical_activity	1588	-0.002	9.34e-01

#Which variables are associated with physical activity levels? #Age was the only variable found to be significantly correlated with physical activity levels; however, age had a weak negative association.

#Table C: Predictors vs. Each Other — 10 points #All pairwise correlations among: - Age - BMI (transformed) - Total calcium (transformed) - Total vitamin D (transformed) #Hint: Use combn() to generate pairs, then map_dfr() with corr_pair() #Use combn(predictors, 2, simplify = FALSE) to generate pairs, then map_dfr() to apply corr_pair() to each pair.

preds <- c("Age", "log_BMI", "log_tot_calcium", "sqrt_tot_vitamin_D")
pairs <- combn(preds, 2, simplify = FALSE)
tableC <- map_dfr(pairs, ~corr_pair(data_3, .x[1], .x[2])) %>%
  mutate(
  estimate = round(estimate, 3),
  p_value = signif(p_value, 3))

tableC %>% kable()

x	y	n	estimate	p_value
Age	log_BMI	2275	-0.093	9.3e-06
Age	log_tot_calcium	2286	NaN	NaN
Age	sqrt_tot_vitamin_D	2286	0.146	0.0e+00
log_BMI	log_tot_calcium	2275	NaN	NaN
log_BMI	sqrt_tot_vitamin_D	2275	0.024	2.5e-01
log_tot_calcium	sqrt_tot_vitamin_D	2286	NaN	NaN

#Both age and BMI and age and vitamin D are significantly correlated, suggesting potential multicollinearity concerns. However, these correlations are weak and follow-up VIF testing would be beneficial.

Reflection Questions

#A. Comparing Methods (3 points) - When to use ANOVA vs. correlation? - Key differences in what they tell us - Example research questions for each #ANOVA should be used when one variable is categorical with a continuous outcome, whereas correlation should be used for continuous or ordinal variables. #Correlation will tell us that variables are associated with eachother, but this does not mean causally association. The classic association between ice cream sales and drowning deaths is a key example. #ANOVA compares variable mean changes across different groups. For example, ANOVA can compare vaccination rates across geographic regions.

#B. Assumptions & Limitations (4 points) - Which assumptions were challenging? - How might violations affect conclusions? - Limitations of cross-sectional correlation for causal inference #The strength of the correlations could affect our conclusions. In addition, regression analysis is required to determine if any correlations are spurious. #Testing for VIF is important to test for multicollinearity.

#C. R Programming Challenge (3 points) - What was most difficult? - How did you solve it? - What skill did you strengthen? #Writing this Rscript was generally not too challenging. I completed part 1 and half of part 2 of this script prior to the answer key being released without issue. #I did fail to convert a cor.test into a table using prior code that the professor used to convert an anova result so this would be helpful code to cover in class.

EPI553_HW01_GoldmanMatthew

2026-02-20

PART 0 Prepare the dataset

PART 1 One-Way ANOVA

PART 2 Correlation Analysis

Reflection Questions