BMD <- read.csv("data/bmd.csv")
#Select key variables and rename
BMD_clean <- BMD %>%
select(SEQN, DXXOFBMD, RIDRETH1, RIAGENDR, RIDAGEYR, BMXBMI, smoker, calcium, DSQTCALC, vitd, DSQTVD, totmet) %>%
rename(BMD = DXXOFBMD,
Race = RIDRETH1,
Sex = RIAGENDR,
Age = RIDAGEYR,
BMI = BMXBMI,
supp_calcium = DSQTCALC,
diet_vitamin_D = vitd,
supp_vitamin_D = DSQTVD,
Physical_activity = totmet)
BMD_clean_2 <- BMD_clean %>%
mutate(Race = factor(Race, levels=c(1,2,3,4,5),
labels=c("Mexican American", "Other Hispanic", "Non-Hispanic, White", "Non-Hispanic Black", "Other"))) %>%
mutate(Sex = factor(Sex, levels=c(1,2), labels=c("Male", "Female"))) %>%
mutate(smoker = factor(smoker, levels=c(1,2,3), labels=c("Current", "Past", "Never")))
#Display N before removing missing
print(nrow(BMD_clean_2)) #2898
## [1] 2898
#Percent missing report for BMD and race
mean(is.na(BMD_clean_2$BMD)) * 100 #21%
## [1] 21.11801
mean(is.na(BMD_clean_2$Race)) * 100 #0%
## [1] 0
#Remove missing values from race and BMD
data_1 <- BMD_clean_2 %>%
filter(!is.na(BMD))
#Display N after removing missing
print(nrow(data_1)) #2286
## [1] 2286
#Research Question: Do mean BMD values differ across ethnicity groups?
#Step 1 - Hypotheses #Null Hypothesis (H₀): μ_1 = μ_2 = μ_3 = μ_4 = μ_5 - All five ethnicity groups BMD means are equal) #Alternative Hypothesis (H₁): At least one ethnicity BMD mean differs from the others
#Step 2 - Exploratory Analysis
# Create summary table of BMD by ethnicity
BMD_by_ethnicity <- data_1 %>%
group_by(Race) %>%
summarise(
N = n(),
Mean_BMD = round(mean(BMD, na.rm = TRUE), 2),
SD_BMD = sd(BMD, na.rm = TRUE),
Lower_BMD = (Mean_BMD - SD_BMD),
Upper_BMD = (Mean_BMD + SD_BMD))
summary_table <- BMD_by_ethnicity %>%
select(
Race,
N,
Mean_BMD,
Lower_BMD,
Upper_BMD)
kable(summary_table,
caption = "Bone Marrow Density by Ethnicity",
format = "html") %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)
| Race | N | Mean_BMD | Lower_BMD | Upper_BMD |
|---|---|---|---|---|
| Mexican American | 255 | 0.95 | 0.8011257 | 1.098874 |
| Other Hispanic | 244 | 0.95 | 0.7944264 | 1.105574 |
| Non-Hispanic, White | 846 | 0.89 | 0.7302357 | 1.049764 |
| Non-Hispanic Black | 532 | 0.97 | 0.8091685 | 1.130831 |
| Other | 409 | 0.90 | 0.7524478 | 1.047552 |
#Create plot of BMD by ethnicity
ggplot(data=data_1, aes(x= Race, y=BMD)) +
geom_boxplot(fill = "white", colour = "darkblue") +
ylab("Bone Marrow Density") +
ggtitle("Box plot of Bone Marrow Density by Ethnicity") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
#Step 3 - ANOVA F-test
# Outcome: BMD
# Predictor: Race
# Fit the one-way ANOVA model
anova_model <- aov(BMD ~ Race, data = data_1)
# Display the ANOVA table
summary(anova_model)
## Df Sum Sq Mean Sq F value Pr(>F)
## Race 4 2.95 0.7371 30.18 <2e-16 ***
## Residuals 2281 55.70 0.0244
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova_table <- summary(anova_model)
print(anova_table)
## Df Sum Sq Mean Sq F value Pr(>F)
## Race 4 2.95 0.7371 30.18 <2e-16 ***
## Residuals 2281 55.70 0.0244
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova_df <- data.frame(
Source = c("Ethnicity", "Residuals"),
DF = c(anova_table[[1]]$Df[1], anova_table[[1]]$Df[2]),
`Sum Sq` = c(anova_table[[1]]$`Sum Sq`[1], anova_table[[1]]$`Sum Sq`[2]),
`Mean Sq` = c(anova_table[[1]]$`Mean Sq`[1], anova_table[[1]]$`Mean Sq`[2]),
`F value` = c(anova_table[[1]]$`F value`[1], NA),
`Pr(>F)` = c(anova_table[[1]]$`Pr(>F)`[1], NA)
)
kable(anova_df, digits = 4,
caption = "Bone Marrow Density by Ethnicity",
col.names = c("Source", "DF", "Sum Sq", "Mean Sq", "F value", "p-value"),
format = "html") %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)
| Source | DF | Sum Sq | Mean Sq | F value | p-value |
|---|---|---|---|---|---|
| Ethnicity | 4 | 2.9482 | 0.7371 | 30.185 | 0 |
| Residuals | 2281 | 55.6973 | 0.0244 | NA | NA |
#F-statistic: 30.2 #Degrees of freedom: 4 #p-value: <2e-16 (very small) #Decision (reject or fail to reject H₀): Since p < 0.05, we reject H₀ #Statistical conclusion in words: There is statistically significant evidence that bone marrow density differs across at least ethnicities
# Step 4 - Assumption Checks
#A. Normality: - Create Q-Q plot - Interpret whether residuals appear normally distributed
par(mfrow = c(2, 2))
plot(anova_model, which = 1:4)
#The values appear to fall closely on the horizontal line, suggesting a
normal distribution.
#B. Equal Variances: - Conduct Levene’s test - Report test statistic and p-value - State conclusion about homogeneity of variance
levene_test <- leveneTest(BMD ~ Race, data = data_1)
print(levene_test)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 4 1.5728 0.1788
## 2281
#Fail to reject equal variances (p > 0.05). A non-significant result (p ≥ 0.05) for Levene’s test indicates homogeneous variances across ethnic groups, supporting the equal variance assumption underlying ANOVA.
# Step 5 - Post-hoc Comparisons
#Conduct Tukey HSD test
tukey_results <- TukeyHSD(anova_model)
print(tukey_results)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = BMD ~ Race, data = data_1)
##
## $Race
## diff lwr upr
## Other Hispanic-Mexican American -0.004441273 -0.042644130 0.03376158
## Non-Hispanic, White-Mexican American -0.062377249 -0.092852618 -0.03190188
## Non-Hispanic Black-Mexican American 0.022391619 -0.010100052 0.05488329
## Other-Mexican American -0.053319344 -0.087357253 -0.01928144
## Non-Hispanic, White-Other Hispanic -0.057935976 -0.088934694 -0.02693726
## Non-Hispanic Black-Other Hispanic 0.026832892 -0.006150151 0.05981593
## Other-Other Hispanic -0.048878071 -0.083385341 -0.01437080
## Non-Hispanic Black-Non-Hispanic, White 0.084768868 0.061164402 0.10837333
## Other-Non-Hispanic, White 0.009057905 -0.016633367 0.03474918
## Other-Non-Hispanic Black -0.075710963 -0.103764519 -0.04765741
## p adj
## Other Hispanic-Mexican American 0.9978049
## Non-Hispanic, White-Mexican American 0.0000003
## Non-Hispanic Black-Mexican American 0.3276613
## Other-Mexican American 0.0001919
## Non-Hispanic, White-Other Hispanic 0.0000036
## Non-Hispanic Black-Other Hispanic 0.1722466
## Other-Other Hispanic 0.0010720
## Non-Hispanic Black-Non-Hispanic, White 0.0000000
## Other-Non-Hispanic, White 0.8719109
## Other-Non-Hispanic Black 0.0000000
#Create formatted table
tukey_summary <- as.data.frame(tukey_results$Race)
tukey_summary$Comparison <- rownames(tukey_summary)
tukey_summary <- tukey_summary[, c("Comparison", "diff", "lwr", "upr", "p adj")]
# Add interpretation columns
tukey_summary$Significant <- ifelse(tukey_summary$`p adj` < 0.05, "Yes", "No")
tukey_summary$Direction <- ifelse(
tukey_summary$Significant == "Yes",
ifelse(tukey_summary$diff > 0, "First group higher", "Second group higher"),
"No difference"
)
kable(tukey_summary, digits = 4,
caption = "Tukey HSD post-hoc comparisons with 95% confidence intervals",
format = "html") %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)
| Comparison | diff | lwr | upr | p adj | Significant | Direction | |
|---|---|---|---|---|---|---|---|
| Other Hispanic-Mexican American | Other Hispanic-Mexican American | -0.0044 | -0.0426 | 0.0338 | 0.9978 | No | No difference |
| Non-Hispanic, White-Mexican American | Non-Hispanic, White-Mexican American | -0.0624 | -0.0929 | -0.0319 | 0.0000 | Yes | Second group higher |
| Non-Hispanic Black-Mexican American | Non-Hispanic Black-Mexican American | 0.0224 | -0.0101 | 0.0549 | 0.3277 | No | No difference |
| Other-Mexican American | Other-Mexican American | -0.0533 | -0.0874 | -0.0193 | 0.0002 | Yes | Second group higher |
| Non-Hispanic, White-Other Hispanic | Non-Hispanic, White-Other Hispanic | -0.0579 | -0.0889 | -0.0269 | 0.0000 | Yes | Second group higher |
| Non-Hispanic Black-Other Hispanic | Non-Hispanic Black-Other Hispanic | 0.0268 | -0.0062 | 0.0598 | 0.1722 | No | No difference |
| Other-Other Hispanic | Other-Other Hispanic | -0.0489 | -0.0834 | -0.0144 | 0.0011 | Yes | Second group higher |
| Non-Hispanic Black-Non-Hispanic, White | Non-Hispanic Black-Non-Hispanic, White | 0.0848 | 0.0612 | 0.1084 | 0.0000 | Yes | First group higher |
| Other-Non-Hispanic, White | Other-Non-Hispanic, White | 0.0091 | -0.0166 | 0.0347 | 0.8719 | No | No difference |
| Other-Non-Hispanic Black | Other-Non-Hispanic Black | -0.0757 | -0.1038 | -0.0477 | 0.0000 | Yes | Second group higher |
#Non-Hispanic, White and Mexican American ethnic groups significantly differ, with Mexican American having higher BMD #Other and Mexican American ethnic groups significantly differ, with Mexican American having higher BMD #Non-Hispanic, White and Other Hispanic ethnic groups significantly differ, with Other Hispanic having higher BMD #Other and Other Hispanic ethnic groups significantly differ, with Other having higher BMD #Non-Hispanic, Black and Non Hispanic, White ethnic groups significantly differ, with Non-Hispanic, Black having higher BMD #Other and Non-Hispanic Black ethnic groups significantly differ, with Non-Hispanic, Black having higher BMD
# Step 6 - Effect Size
# Calculate eta-squared: eta-squared = SS_between / SS_total
ss_treatment <- anova_table[[1]]$`Sum Sq`[1]
ss_total <- sum(anova_table[[1]]$`Sum Sq`)
eta_squared <- ss_treatment / ss_total
#Interpret using Cohen’s guidelines #The η² = 0.0503 (about 5%). This is a small effect size, meaning ethnicity explains only 5% of BMD variance. While the F-test suggests significance, the magnitude of the association is modest with other unmeasured factors explaining the remaining 95%
#Step 1 - Create new calculated variables
data_2 <- data_1 %>%
mutate(calcium = ifelse(is.na(calcium), 0, calcium)) %>%
mutate(supp_calcium = ifelse(is.na(supp_calcium), 0, supp_calcium)) %>%
mutate(diet_vitamin_D = ifelse(is.na(diet_vitamin_D), 0, diet_vitamin_D)) %>%
mutate(supp_vitamin_D = ifelse(is.na(supp_vitamin_D), 0, supp_vitamin_D))
data_2$calcium_total <- (data_2$calcium + data_2$supp_calcium)
data_2$vitamin_d_total <- (data_2$diet_vitamin_D + data_2$supp_vitamin_D)
#Step 2 - Apply transformations
#Create histograms for continuous variables (BMI, calcium, vitamin D, MET)
#Decide on transformations based on skewness
#Document reasoning
ggplot(data=data_2, aes(BMI)) +
geom_histogram(bins=10)
## Warning: Removed 11 rows containing non-finite outside the scale range
## (`stat_bin()`).
ggplot(data=data_2, aes(calcium_total)) +
geom_histogram(bins=10)
ggplot(data=data_2, aes(vitamin_d_total)) +
geom_histogram(bins=10)
ggplot(data=data_2, aes(Physical_activity)) +
geom_histogram(bins=10)
## Warning: Removed 698 rows containing non-finite outside the scale range
## (`stat_bin()`).
#Transformations: #Log transformation: BMI, calcium (reduce right
skewedness and normalize data) #Square root: MET, vitamin D (handles
zeros better)
data_3 <- data_2
data_3$log_tot_calcium <- log(data_2$calcium_total)
data_3$log_BMI <- log(data_2$BMI)
data_3$sqrt_tot_vitamin_D <- sqrt(data_2$vitamin_d_total)
data_3$sqrt_Physical_activity <-sqrt(data_2$Physical_activity)
ggplot(data=data_3, aes(log_tot_calcium)) +
geom_histogram(bins=10)
## Warning: Removed 93 rows containing non-finite outside the scale range
## (`stat_bin()`).
ggplot(data=data_3, aes(log_BMI)) +
geom_histogram(bins=10)
## Warning: Removed 11 rows containing non-finite outside the scale range
## (`stat_bin()`).
ggplot(data=data_3, aes(sqrt_tot_vitamin_D)) +
geom_histogram(bins=10)
ggplot(data=data_3, aes(sqrt_Physical_activity)) +
geom_histogram(bins=10)
## Warning: Removed 698 rows containing non-finite outside the scale range
## (`stat_bin()`).
#Step 3: Correlation Tables
# Function to calculate Pearson correlation for a pair of variables
corr_pair <- function(data, x, y) {
# Select variables and remove missing values
d <- data %>%
select(all_of(c(x, y))) %>%
drop_na()
# Need at least 3 observations for correlation
if(nrow(d) < 3) {
return(tibble(
x = x,
y = y,
n = nrow(d),
estimate = NA_real_,
p_value = NA_real_
))
}
# Calculate Pearson correlation
ct <- cor.test(d[[x]], d[[y]], method = "pearson")
# Return tidy results
tibble(
x = x,
y = y,
n = nrow(d),
estimate = unname(ct$estimate),
p_value = ct$p.value
)}
#Use the provided corr_pair() helper function
#Each table must include: correlation coefficient (r), p-value, sample size (n)
#Table A: Predictors vs. Correlations between: - Age (RIDAGEYR) vs. BMD - BMI (log-transformed) vs. BMD - Total calcium
#(log-transformed) vs. BMD - Total vitamin D (sqrt-transformed) vs. BMD
tableA <- bind_rows(
corr_pair(data_3, "Age", "BMD"),
corr_pair(data_3, "log_BMI", "BMD"),
corr_pair(data_3, "log_tot_calcium", "BMD"),
corr_pair(data_3, "sqrt_tot_vitamin_D", "BMD")
) %>%
mutate(
estimate = round(estimate, 3),
p_value = signif(p_value, 3)
)
tableA %>% kable()
| x | y | n | estimate | p_value |
|---|---|---|---|---|
| Age | BMD | 2286 | -0.232 | 0.0000 |
| log_BMI | BMD | 2275 | 0.425 | 0.0000 |
| log_tot_calcium | BMD | 2286 | NaN | NaN |
| sqrt_tot_vitamin_D | BMD | 2286 | -0.051 | 0.0151 |
#Age, BMI, and total vitamin D were significantly correlated with BMD. While BMI was moderately positively correlated with BMD, age and vitamin D had weak negative associations.
#Table B: Predictors vs. Exposure (Physical Activity) — 10 points #Correlations between: - Age vs. MET (sqrt-transformed) - BMI vs. MET - Total calcium vs. MET - Total vitamin D vs. MET
tableB <- bind_rows(
corr_pair(data_3, "Age", "sqrt_Physical_activity"),
corr_pair(data_3, "log_BMI", "sqrt_Physical_activity"),
corr_pair(data_3, "log_tot_calcium", "sqrt_Physical_activity"),
corr_pair(data_3, "sqrt_tot_vitamin_D", "sqrt_Physical_activity")
) %>%
mutate(
estimate = round(estimate, 3),
p_value = signif(p_value, 3)
)
tableB %>% kable()
| x | y | n | estimate | p_value |
|---|---|---|---|---|
| Age | sqrt_Physical_activity | 1588 | -0.105 | 2.67e-05 |
| log_BMI | sqrt_Physical_activity | 1582 | 0.009 | 7.32e-01 |
| log_tot_calcium | sqrt_Physical_activity | 1588 | NaN | NaN |
| sqrt_tot_vitamin_D | sqrt_Physical_activity | 1588 | -0.002 | 9.34e-01 |
#Which variables are associated with physical activity levels? #Age was the only variable found to be significantly correlated with physical activity levels; however, age had a weak negative association.
#Table C: Predictors vs. Each Other — 10 points #All pairwise correlations among: - Age - BMI (transformed) - Total calcium (transformed) - Total vitamin D (transformed) #Hint: Use combn() to generate pairs, then map_dfr() with corr_pair() #Use combn(predictors, 2, simplify = FALSE) to generate pairs, then map_dfr() to apply corr_pair() to each pair.
preds <- c("Age", "log_BMI", "log_tot_calcium", "sqrt_tot_vitamin_D")
pairs <- combn(preds, 2, simplify = FALSE)
tableC <- map_dfr(pairs, ~corr_pair(data_3, .x[1], .x[2])) %>%
mutate(
estimate = round(estimate, 3),
p_value = signif(p_value, 3))
tableC %>% kable()
| x | y | n | estimate | p_value |
|---|---|---|---|---|
| Age | log_BMI | 2275 | -0.093 | 9.3e-06 |
| Age | log_tot_calcium | 2286 | NaN | NaN |
| Age | sqrt_tot_vitamin_D | 2286 | 0.146 | 0.0e+00 |
| log_BMI | log_tot_calcium | 2275 | NaN | NaN |
| log_BMI | sqrt_tot_vitamin_D | 2275 | 0.024 | 2.5e-01 |
| log_tot_calcium | sqrt_tot_vitamin_D | 2286 | NaN | NaN |
#Both age and BMI and age and vitamin D are significantly correlated, suggesting potential multicollinearity concerns. However, these correlations are weak and follow-up VIF testing would be beneficial.
#A. Comparing Methods (3 points) - When to use ANOVA vs. correlation? - Key differences in what they tell us - Example research questions for each #ANOVA should be used when one variable is categorical with a continuous outcome, whereas correlation should be used for continuous or ordinal variables. #Correlation will tell us that variables are associated with eachother, but this does not mean causally association. The classic association between ice cream sales and drowning deaths is a key example. #ANOVA compares variable mean changes across different groups. For example, ANOVA can compare vaccination rates across geographic regions.
#B. Assumptions & Limitations (4 points) - Which assumptions were challenging? - How might violations affect conclusions? - Limitations of cross-sectional correlation for causal inference #The strength of the correlations could affect our conclusions. In addition, regression analysis is required to determine if any correlations are spurious. #Testing for VIF is important to test for multicollinearity.
#C. R Programming Challenge (3 points) - What was most difficult? - How did you solve it? - What skill did you strengthen? #Writing this Rscript was generally not too challenging. I completed part 1 and half of part 2 of this script prior to the answer key being released without issue. #I did fail to convert a cor.test into a table using prior code that the professor used to convert an anova result so this would be helpful code to cover in class.