# 1. Load necessary libraries
library(tidyverse) # For data manipulation and visualization
library(knitr) # For nice tables
library(car) # For Levene's test
library(NHANES) # NHANES dataset
library(dplyr)
library (ggplot2)
library(kableExtra)
library(broom)
library(readr)
library(ggstats)
# Set seed
set.seed(553)# Set to the folder containing bmd.csv
setwd("C:/Users/God's Icon/Desktop/553-Statistical Inference/553 Lab Codes/data")
# 2. Import CSV
bmd_data <- read_csv("bmd.csv")
# View first few rows
head(bmd_data)## # A tibble: 6 × 14
## SEQN RIAGENDR RIDAGEYR RIDRETH1 BMXBMI smoker totmet metcat DXXOFBMD tbmdcat
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 93705 2 66 4 31.7 2 240 0 1.06 0
## 2 93708 2 66 5 23.7 3 120 0 0.801 1
## 3 93709 2 75 4 38.9 1 720 1 0.88 0
## 4 93711 1 56 5 21.3 3 840 1 0.851 1
## 5 93713 1 67 3 23.5 1 360 0 0.778 1
## 6 93714 2 54 4 39.9 2 NA NA 0.994 0
## # ℹ 4 more variables: calcium <dbl>, vitd <dbl>, DSQTVD <dbl>, DSQTCALC <dbl>
## [1] "SEQN" "RIAGENDR" "RIDAGEYR" "RIDRETH1" "BMXBMI" "smoker"
## [7] "totmet" "metcat" "DXXOFBMD" "tbmdcat" "calcium" "vitd"
## [13] "DSQTVD" "DSQTCALC"
## SEQN RIAGENDR RIDAGEYR RIDRETH1
## Min. : 93705 Min. :1.000 Min. :50.00 Min. :1.000
## 1st Qu.: 95903 1st Qu.:1.000 1st Qu.:58.00 1st Qu.:3.000
## Median : 98233 Median :2.000 Median :64.00 Median :3.000
## Mean : 98257 Mean :1.506 Mean :65.21 Mean :3.257
## 3rd Qu.:100581 3rd Qu.:2.000 3rd Qu.:73.00 3rd Qu.:4.000
## Max. :102952 Max. :2.000 Max. :80.00 Max. :5.000
##
## BMXBMI smoker totmet metcat
## Min. :14.20 Min. :1.000 Min. : 0 Min. :0.0000
## 1st Qu.:25.20 1st Qu.:2.000 1st Qu.: 240 1st Qu.:0.0000
## Median :28.70 Median :3.000 Median : 480 Median :0.0000
## Mean :29.76 Mean :2.381 Mean :1015 Mean :0.7048
## 3rd Qu.:33.40 3rd Qu.:3.000 3rd Qu.:1430 3rd Qu.:1.0000
## Max. :84.40 Max. :3.000 Max. :9120 Max. :2.0000
## NA's :64 NA's :2 NA's :964 NA's :964
## DXXOFBMD tbmdcat calcium vitd
## Min. :0.3700 Min. :0.0000 Min. : 55.0 Min. : 0.000
## 1st Qu.:0.8123 1st Qu.:0.0000 1st Qu.: 525.0 1st Qu.: 1.750
## Median :0.9165 Median :0.0000 Median : 773.0 Median : 3.200
## Mean :0.9223 Mean :0.3985 Mean : 840.5 Mean : 4.471
## 3rd Qu.:1.0320 3rd Qu.:1.0000 3rd Qu.:1049.0 3rd Qu.: 5.500
## Max. :1.5500 Max. :2.0000 Max. :5199.0 Max. :57.900
## NA's :612 NA's :612 NA's :293 NA's :293
## DSQTVD DSQTCALC
## Min. : 0.004 Min. : 0.74
## 1st Qu.: 15.709 1st Qu.: 93.33
## Median : 25.000 Median : 211.67
## Mean : 49.851 Mean : 357.98
## 3rd Qu.: 50.000 3rd Qu.: 500.00
## Max. :2570.000 Max. :3220.00
## NA's :1515 NA's :1633
Create analysis dataset
# 3. Convert RIDRETH1 to factor with meaningful ethnicity labels
bmd_data$RIDRETH1 <- factor(bmd_data$RIDRETH1,
levels = c(1, 2, 3, 4, 5),
labels = c("Mexican American",
"Other Hispanic",
"Non-Hispanic White",
"Non-Hispanic Black",
"Other"))
# Check the conversion worked
table(bmd_data$RIDRETH1)##
## Mexican American Other Hispanic Non-Hispanic White Non-Hispanic Black
## 331 286 1086 696
## Other
## 499
## Factor w/ 5 levels "Mexican American",..: 4 5 4 5 3 4 5 5 1 3 ...
## [1] Non-Hispanic Black Other Non-Hispanic Black Other
## [5] Non-Hispanic White Non-Hispanic Black Other Other
## [9] Mexican American Non-Hispanic White Non-Hispanic White Mexican American
## [13] Other Non-Hispanic Black Other Hispanic Non-Hispanic White
## [17] Other Hispanic Non-Hispanic Black Non-Hispanic Black Non-Hispanic White
## 5 Levels: Mexican American Other Hispanic ... Other
# 4. Convert RIAGENDR to factor (Male/Female) labels
bmd_data$RIAGENDR <- factor(bmd_data$RIAGENDR,
levels = c(1, 2),
labels = c("Male", "Female"))
# Check the conversion
head(bmd_data$RIAGENDR)## [1] Female Female Female Male Male Female
## Levels: Male Female
##
## Male Female
## 1431 1467
# 5. Convert smoker to factor with Current/Past/Never labels
bmd_data$smoker <- factor(bmd_data$smoker,
levels = c(1, 2, 3),
labels = c("Current", "Past", "Never"))
# Check the conversion
head(bmd_data$smoker)## [1] Past Never Current Never Current Past
## Levels: Current Past Never
##
## Current Past Never
## 449 894 1553
## Total sample size: 2898
## [1] 2898
# 7. Create analytic dataset removing missing BMD and ethnicity
# First, check which column is your BMD variable (likely DXXFEBMD or similar)
names(bmd_data)## [1] "SEQN" "RIAGENDR" "RIDAGEYR" "RIDRETH1" "BMXBMI" "smoker"
## [7] "totmet" "metcat" "DXXOFBMD" "tbmdcat" "calcium" "vitd"
## [13] "DSQTVD" "DSQTCALC"
# Remove rows with missing BMD and ethnicity
# Replace 'BMD_column_name' with your actual BMD column name
analytic_data <- bmd_data[complete.cases(bmd_data$DXXOFBMD, bmd_data$RIDRETH1), ]
# Alternative using na.omit for specific columns
analytic_data <- bmd_data[!is.na(bmd_data$DXXOFBMD) & !is.na(bmd_data$RIDRETH1), ]
analytic_data <- bmd_data %>%
filter(!is.na(DXXOFBMD) & !is.na(RIDRETH1))
# 8. Report final analytic sample size
final_n <- nrow(analytic_data)
cat("Final analytic sample size:", final_n, "\n")## Final analytic sample size: 2286
## Number excluded: 612
** Research Question: Do mean BMD values differ across ethnicity groups? **
(H_0: mu_1 = mu_2 = mu_3 = mu_4 = mu_5) - Plain language (what this means for bone health)
Null Hypothesis (H₀): μ_1 = μ_2 = μ_3 = μ_4 =
μ_5
(All five population means are equal)
Alternative Hypothesis (H₁): At least one population mean differs from the others
Significance level: α = 0.05
#Fit the ONE-WAY ANOVA model
anova_model <- aov(DXXOFBMD ~ RIDRETH1, data = analytic_data)
# View the ANOVA results
summary(anova_model)## Df Sum Sq Mean Sq F value Pr(>F)
## RIDRETH1 4 2.95 0.7371 30.18 <2e-16 ***
## Residuals 2281 55.70 0.0244
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
f = 4 Mean = 0.7371 Pr(>F) = <2e-16 F-statistic = 30.18
Interpretation:
group_by(), summarize(), kable()) - Plot: Boxplot or violin plot with jittered points showing BMD distributions
# Create summary table
bmd_summary_stats <- analytic_data %>%
group_by(RIDRETH1) %>%
summarize(
n = n(),
Mean = round(mean(DXXOFBMD, na.rm = TRUE), 3),
SD = round(sd(DXXOFBMD, na.rm = TRUE), 3)
)
# Display as formatted table
kable(bmd_summary_stats,
caption = "Bone Mineral Density by Ethnicity",
col.names = c("Ethnicity", "n", "Mean BMD", "SD"))| Ethnicity | n | Mean BMD | SD |
|---|---|---|---|
| Mexican American | 255 | 0.950 | 0.149 |
| Other Hispanic | 244 | 0.946 | 0.156 |
| Non-Hispanic White | 846 | 0.888 | 0.160 |
| Non-Hispanic Black | 532 | 0.973 | 0.161 |
| Other | 409 | 0.897 | 0.148 |
# Boxplot with jittered points
ggplot(analytic_data,
aes(x = RIDRETH1, y = DXXOFBMD, fill = RIDRETH1)) +
geom_boxplot(alpha = 0.7, outlier.shape = NA) +
geom_jitter(width = 0.2, alpha = 0.1, size = 0.5) +
scale_fill_brewer(palette = "Set2") +
labs(
title = "Bone Mineral Density by Ethnicity",
subtitle = "NHANES Data",
x = "Ethnicity",
y = "BMD (g/cm²)",
fill = "Ethnicity"
) +
theme_minimal(base_size = 12) +
theme(
legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1)
)What the plot tells us:
All five ethnic groups have very similar bone mineral density (BMD) values.
The boxes are all about the same height and position - this means most people in each group have similar BMD values (around 0.9-1.0 g/cm²)
Non-Hispanic Black people have slightly higher BMD - their box sits a bit higher than the others.
Mexican Americans have slightly lower BMD - their box sits a bit lower than the others.
The differences are small - you can see the boxes overlap a lot, meaning the groups are more similar than different Lots of dots (data points) - this shows we have many people in each group, making our results more reliable
# Fit ANOVA model
anova_model <- aov(DXXOFBMD ~ RIDRETH1, data = analytic_data)
# Get ANOVA results
anova_results <- summary(anova_model)
# Create formatted table
anova_table <- data.frame(
Source = c("Ethnicity (Between Groups)", "Residuals (Within Groups)"),
Df = c(4, 2281),
`Sum Sq` = c(2.95, 55.70),
`Mean Sq` = c(0.7371, 0.0244),
`F value` = c(30.18, NA),
`p-value` = c("<0.001", NA)
)
kable(anova_table,
caption = "One-Way ANOVA Results: BMD by Ethnicity",
digits = 4,
col.names = c("Source", "df", "Sum of Squares", "Mean Square", "F-statistic", "p-value"))| Source | df | Sum of Squares | Mean Square | F-statistic | p-value |
|---|---|---|---|---|---|
| Ethnicity (Between Groups) | 4 | 2.95 | 0.7371 | 30.18 | <0.001 |
| Residuals (Within Groups) | 2281 | 55.70 | 0.0244 | NA | NA |
## Df Sum Sq Mean Sq F value Pr(>F)
## RIDRETH1 4 2.95 0.7371 30.18 <2e-16 ***
## Residuals 2281 55.70 0.0244
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Df = 4 Mean = 0.7371 Pr(>F) = <2e-16 F-statistic = 30.18 f = 4 Mean = 0.7371 Pr(>F) = <2e-16 F-statistic = 30.18
Interpretation:
Decision and Interpretation: Decision: We reject the null hypothesis (p < 0.001).
Interpretation: There is statistically significant evidence that mean bone mineral density differs across ethnicity groups (F(4, 2281) = 30.18, p < 0.001). At least one ethnic group has a significantly different mean BMD compared to the others.
#Step 4: Assumption Checks
ANOVA Assumptions:
1.Residuals vs Fitted: Points show vertical bands at distinct fitted values (corresponding to the 5 ethnic groups) with random scatter around zero within each band. No clear pattern or funnel shape → Good! This suggests the model fits reasonably well. 2. Q-Q Plot: Points follow the diagonal line quite well through the middle, with slight deviations at the extreme ends (tails). This is acceptable, especially with a large sample size → Normality assumption is reasonable. 3. Scale-Location: Shows vertical bands at each fitted value with relatively consistent spread. The pattern shows similar variability across groups → Equal variance assumption is reasonable. 4. Residuals vs Leverage: All points fall well within Cook’s distance boundaries (the dashed lines). No points extend beyond these lines → No highly influential outliers that would unduly affect our ANOVA results.
# Conduct Levene's test
levene_test <- leveneTest(DXXOFBMD ~ RIDRETH1, data = analytic_data)
# Display results
print(levene_test)## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 4 1.5728 0.1788
## 2281
# Create formatted table
levene_table <- data.frame(
Test = "Levene's Test",
Df1 = levene_test$Df[1],
Df2 = levene_test$Df[2],
`F value` = round(levene_test$`F value`[1], 3),
`p-value` = round(levene_test$`Pr(>F)`[1], 4)
)
kable(levene_table,
caption = "Levene's Test for Homogeneity of Variance",
col.names = c("Test", "df1", "df2", "F-statistic", "p-value"))| Test | df1 | df2 | F-statistic | p-value |
|---|---|---|---|---|
| Levene’s Test | 4 | 2281 | 1.573 | 0.1788 |
Levene’s Test Interpretation: Test Results: F-statistic: 1.573 Degrees of freedom: df1 = 4, df2 = 2281 p-value: 0.1788
Interpretation: Since p = 0.179 > 0.05, we fail to reject the null hypothesis of equal variances. This means the equal variance assumption is met - the variability in BMD is similar across all five ethnic groups.
Conclusion: The homogeneity of variance assumption required for ANOVA is satisfied. The Scale-Location plot showed consistent spread, we can be confident that our ANOVA assumptions are valid.
#Step 5: Post-hoc Comparisons (Tukey HSD)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = DXXOFBMD ~ RIDRETH1, data = analytic_data)
##
## $RIDRETH1
## diff lwr upr
## Other Hispanic-Mexican American -0.004441273 -0.042644130 0.03376158
## Non-Hispanic White-Mexican American -0.062377249 -0.092852618 -0.03190188
## Non-Hispanic Black-Mexican American 0.022391619 -0.010100052 0.05488329
## Other-Mexican American -0.053319344 -0.087357253 -0.01928144
## Non-Hispanic White-Other Hispanic -0.057935976 -0.088934694 -0.02693726
## Non-Hispanic Black-Other Hispanic 0.026832892 -0.006150151 0.05981593
## Other-Other Hispanic -0.048878071 -0.083385341 -0.01437080
## Non-Hispanic Black-Non-Hispanic White 0.084768868 0.061164402 0.10837333
## Other-Non-Hispanic White 0.009057905 -0.016633367 0.03474918
## Other-Non-Hispanic Black -0.075710963 -0.103764519 -0.04765741
## p adj
## Other Hispanic-Mexican American 0.9978049
## Non-Hispanic White-Mexican American 0.0000003
## Non-Hispanic Black-Mexican American 0.3276613
## Other-Mexican American 0.0001919
## Non-Hispanic White-Other Hispanic 0.0000036
## Non-Hispanic Black-Other Hispanic 0.1722466
## Other-Other Hispanic 0.0010720
## Non-Hispanic Black-Non-Hispanic White 0.0000000
## Other-Non-Hispanic White 0.8719109
## Other-Non-Hispanic Black 0.0000000
| Comparison | Mean Difference | 95% CI Lower | 95% CI Upper | Adjusted p-value | Significant | |
|---|---|---|---|---|---|---|
| Other Hispanic-Mexican American | Other Hispanic-Mexican American | -0.0044 | -0.0426 | 0.0338 | 0.9978 | No |
| Non-Hispanic White-Mexican American | Non-Hispanic White-Mexican American | -0.0624 | -0.0929 | -0.0319 | 0.0000 | Yes |
| Non-Hispanic Black-Mexican American | Non-Hispanic Black-Mexican American | 0.0224 | -0.0101 | 0.0549 | 0.3277 | No |
| Other-Mexican American | Other-Mexican American | -0.0533 | -0.0874 | -0.0193 | 0.0002 | Yes |
| Non-Hispanic White-Other Hispanic | Non-Hispanic White-Other Hispanic | -0.0579 | -0.0889 | -0.0269 | 0.0000 | Yes |
| Non-Hispanic Black-Other Hispanic | Non-Hispanic Black-Other Hispanic | 0.0268 | -0.0062 | 0.0598 | 0.1722 | No |
| Other-Other Hispanic | Other-Other Hispanic | -0.0489 | -0.0834 | -0.0144 | 0.0011 | Yes |
| Non-Hispanic Black-Non-Hispanic White | Non-Hispanic Black-Non-Hispanic White | 0.0848 | 0.0612 | 0.1084 | 0.0000 | Yes |
| Other-Non-Hispanic White | Other-Non-Hispanic White | 0.0091 | -0.0166 | 0.0347 | 0.8719 | No |
| Other-Non-Hispanic Black | Other-Non-Hispanic Black | -0.0757 | -0.1038 | -0.0477 | 0.0000 | Yes |
Interpretation: Based on the Tukey-adjusted 95% confidence intervals, ethnic group comparisons whose intervals exclude zero (do not cross the vertical dashed line) show statistically significant differences in mean BMD, whereas comparisons whose intervals include zero indicate no significant difference between those groups.
Pair-wise Interpretation: Non-Hispanic White vs Mexican American: Mean difference = -0.0624 g/cm² (95% CI: -0.0929, -0.0319, p < 0.001). Non-Hispanic White individuals have significantly lower BMD than Mexican Americans.
Other vs Mexican American: Mean difference = -0.0533 g/cm² (95% CI: -0.0874, -0.0193, p = 0.0002). The Other ethnicity group has significantly lower BMD than Mexican Americans.
Non-Hispanic White vs Other Hispanic: Mean difference = -0.0579 g/cm² (95% CI: -0.0889, -0.0269, p < 0.001). Non-Hispanic White individuals have significantly lower BMD than Other Hispanic individuals.
Other vs Other Hispanic: Mean difference = -0.0489 g/cm² (95% CI: -0.0834, -0.0144, p = 0.0011). The Other ethnicity group has significantly lower BMD than Other Hispanic individuals.
Non-Hispanic Black vs Non-Hispanic White: Mean difference = 0.0848 g/cm² (95% CI: 0.0612, 0.1084, p < 0.001). Non-Hispanic Black individuals have significantly higher BMD than Non-Hispanic White individuals.
Other vs Non-Hispanic Black: Mean difference = -0.0757 g/cm² (95% CI: -0.1038, -0.0477, p < 0.001). The Other ethnicity group has significantly lower BMD than Non-Hispanic Black individuals.
# Hint: Extract Sum Sq from the ANOVA summary
# Extract sum of squares from ANOVA table
anova_summary <- summary(anova_model)[[1]]
ss_treatment <- anova_summary$`Sum Sq`[1]
ss_total <- sum(anova_summary$`Sum Sq`)
# Calculate eta-squared
eta_squared <- ss_treatment / ss_total
cat("Eta-squared (η²):", round(eta_squared, 4), "\n")## Eta-squared (η²): 0.0503
## Percentage of variance explained: 5.03 %
# Create formatted output
effect_size_table <- data.frame(
Measure = "Eta-squared (η²)",
Value = round(eta_squared, 4),
`Percent Variance` = paste0(round(eta_squared * 100, 2), "%"),
Interpretation = "Small to Medium Effect"
)
kable(effect_size_table,
caption = "Effect Size: Proportion of Variance Explained by Ethnicity",
col.names = c("Effect Size Measure", "Value", "% Variance Explained", "Interpretation"))| Effect Size Measure | Value | % Variance Explained | Interpretation |
|---|---|---|---|
| Eta-squared (η²) | 0.0503 | 5.03% | Small to Medium Effect |
-Effect Size Interpretation: η² = r round(eta_squared, 4) or r round(eta_squared * 100, 2)% Small
Effect size guidelines: Small effect: η² ≈ 0.01 (1%) Medium effect: η² ≈ 0.06 (6%) Large effect: η² ≈ 0.14 (14%)
Our effect: Table: Effect Size: Proportion of Variance Explained by Ethnicity
| Effect Size Measure | Value | % Variance Explained | Interpretation |
|---|---|---|---|
| Eta-squared (η²) | 0.0503 | 5.03% | Small to Medium Effect |
Interpretation: Race/Ethnicity explains 5.03% of the variance in bone mineral density. This indicates a small to moderate effect size, meaning that while ethnicity is a statistically significant predictor of BMD, it accounts for only a small portion of the total variation in bone density. Other factors (age, sex, lifestyle, genetics, nutrition, etc.) explain the remaining 95% of the variance.
This represents a small to medium effect size, indicating that while ethnicity is a statistically significant predictor of BMD, it accounts for only a modest portion of the total variation. Other factors (genetics, lifestyle, nutrition, physical activity, age, etc.) explain the remaining ~95% of variance in bone mineral density.
DSQTCALC (replace NA with 0) - vitd_total = vitd + DSQTVD (replace NA with 0)
# Replace NA with 0 and create total variables
analytic_data$calcium_total <- analytic_data$calcium +
ifelse(is.na(analytic_data$DSQTCALC), 0, analytic_data$DSQTCALC)
analytic_data$vitd_total <- analytic_data$vitd +
ifelse(is.na(analytic_data$DSQTVD), 0, analytic_data$DSQTVD)
# Verify
summary(analytic_data[, c("calcium_total", "vitd_total")])## calcium_total vitd_total
## Min. : 55.0 Min. : 0.00
## 1st Qu.: 604.5 1st Qu.: 2.95
## Median : 901.5 Median : 8.60
## Mean :1001.5 Mean : 29.12
## 3rd Qu.:1288.5 3rd Qu.: 30.40
## Max. :5399.0 Max. :2574.65
## NA's :157 NA's :157
## [1] 157
## [1] 157
• Create histograms for continuous variables (BMI, calcium, vitamin D, MET) • Decide on transformations based on skewness • Document reasoning
Suggested transformations: - Log transformation: BMI, calcium - Square root: MET, vitamin D (handles zeros better)
# Create histograms for all continuous variables
p1 <- ggplot(analytic_data, aes(x = BMXBMI)) +
geom_histogram(fill = "steelblue", color = "black", bins = 30) +
labs(title = "BMI Distribution", x = "BMI", y = "Frequency") +
theme_minimal()
p2 <- ggplot(analytic_data, aes(x = calcium_total)) +
geom_histogram(fill = "steelblue", color = "black", bins = 30) +
labs(title = "Total Calcium Distribution", x = "Calcium (mg)", y = "Frequency") +
theme_minimal()
p3 <- ggplot(analytic_data, aes(x = vitd_total)) +
geom_histogram(fill = "steelblue", color = "black", bins = 30) +
labs(title = "Total Vitamin D Distribution", x = "Vitamin D (mcg)", y = "Frequency") +
theme_minimal()
p4 <- ggplot(analytic_data, aes(x = totmet)) +
geom_histogram(fill = "steelblue", color = "black", bins = 30) +
labs(title = "Total MET Distribution", x = "MET (min/week)", y = "Frequency") +
theme_minimal()
# Display all plots together
print(p1)## [1] 11
## [1] 157
## [1] 157
## [1] 698
# Check complete cases for correlation analysis
complete_cases <- sum(complete.cases(analytic_data[, c("BMXBMI", "calcium_total", "vitd_total", "totmet")]))
cat("Complete cases for all variables:", complete_cases)## Complete cases for all variables: 1496
Interpretation
Decision: Apply LOG transformation
Justification: Right-skewed shape needs correction No zeros in BMI data, so log is safe to use Log will compress the right tail and make distribution more symmetric for better correlation analysis
Decision: Apply LOG transformation
Justification: Severely right-skewed distribution Potential zero values (some people may have no calcium intake) Log will compress the extreme high values and create more symmetry
Decision: SQUARE ROOT transformation
Justification: Many zeros present (people without supplements), sqrt handles zeros naturally without adding constants, appropriate for heavily zero-inflated data
Decision: SQUARE ROOT transformation
Justification: Many zeros (sedentary individuals), extremely skewed, sqrt handles zeros naturally, appropriate for count-like physical activity data
# Apply transformations based on visual assessment
analytic_data <- analytic_data %>%
mutate(
bmi_log = log(BMXBMI),
calcium_log = log(calcium_total + 1),
vitd_sqrt = sqrt(vitd_total),
met_sqrt = sqrt(totmet)
)
# Check that variables were created
names(analytic_data)## [1] "SEQN" "RIAGENDR" "RIDAGEYR" "RIDRETH1"
## [5] "BMXBMI" "smoker" "totmet" "metcat"
## [9] "DXXOFBMD" "tbmdcat" "calcium" "vitd"
## [13] "DSQTVD" "DSQTCALC" "calcium_total" "vitd_total"
## [17] "bmi_log" "calcium_log" "vitd_sqrt" "met_sqrt"
## [1] 11
## [1] 157
## [1] 157
## [1] 698
# Create comparison plots
# BMI
p5 <- ggplot(analytic_data, aes(x = BMXBMI)) +
geom_histogram(fill = "coral", color = "black", bins = 30) +
labs(title = "Original BMI (Right-Skewed)", x = "BMI") +
theme_minimal()
p6 <- ggplot(analytic_data, aes(x = bmi_log)) +
geom_histogram(fill = "lightgreen", color = "black", bins = 30) +
labs(title = "Log-Transformed BMI (More Normal)", x = "log(BMI)") +
theme_minimal()
print(p5)# Calcium
p7 <- ggplot(analytic_data, aes(x = calcium_total)) +
geom_histogram(fill = "coral", color = "black", bins = 30) +
labs(title = "Original Calcium (Highly Skewed)", x = "Calcium (mg)") +
theme_minimal()
p8 <- ggplot(analytic_data, aes(x = calcium_log)) +
geom_histogram(fill = "lightgreen", color = "black", bins = 30) +
labs(title = "Log-Transformed Calcium (Improved)", x = "log(Calcium+1)") +
theme_minimal()
print(p7)Interpretation BMI Transformation Summary: Before (Coral): Right-skewed with long tail extending to high BMI values (50-60) After (Green): More symmetric and bell-shaped distribution Result: Transformation successful - reduced skewness, improved normality for correlation analysis
Calcium Transformation Summary: Before (Coral): Highly right-skewed, clustered at low values After (Green): Bell-shaped, symmetric, approximately normal Result: Transformation successful - distribution now suitable for correlation analysis
# Create the correlation helper function
corr_pair <- function(data, var1, var2) {
# Remove missing values
complete_data <- data[complete.cases(data[[var1]], data[[var2]]), ]
# Calculate correlation test
cor_test <- cor.test(complete_data[[var1]], complete_data[[var2]],
method = "pearson")
# Extract results
result <- data.frame(
Variable_1 = var1,
Variable_2 = var2,
r = round(cor_test$estimate, 3),
p_value = format.pval(cor_test$p.value, digits = 3),
n = nrow(complete_data)
)
return(result)
}Table A: Predictors vs. Outcome (BMD)
# Calculate correlations between predictors and BMD
table_a <- rbind(
corr_pair(analytic_data, "RIDAGEYR", "DXXOFBMD"), # Age vs BMD
corr_pair(analytic_data, "bmi_log", "DXXOFBMD"), # BMI vs BMD
corr_pair(analytic_data, "calcium_log", "DXXOFBMD"), # Calcium vs BMD
corr_pair(analytic_data, "vitd_sqrt", "DXXOFBMD") # Vitamin D vs BMD
)
# Add descriptive labels
table_a <- table_a %>%
mutate(
Variable_1 = case_when(
Variable_1 == "RIDAGEYR" ~ "Age (years)",
Variable_1 == "bmi_log" ~ "BMI (log-transformed)",
Variable_1 == "calcium_log" ~ "Total Calcium (log-transformed)",
Variable_1 == "vitd_sqrt" ~ "Total Vitamin D (sqrt-transformed)",
TRUE ~ Variable_1
),
Variable_2 = "BMD (g/cm²)"
)
# Display formatted table
kable(table_a,
caption = "Table A: Correlations between Predictors and BMD (Outcome)",
col.names = c("Predictor", "Outcome", "r", "p-value", "n"),
align = c('l', 'l', 'r', 'r', 'r'))| Predictor | Outcome | r | p-value | n | |
|---|---|---|---|---|---|
| cor | Age (years) | BMD (g/cm²) | -0.232 | <2e-16 | 2286 |
| cor1 | BMI (log-transformed) | BMD (g/cm²) | 0.425 | <2e-16 | 2275 |
| cor2 | Total Calcium (log-transformed) | BMD (g/cm²) | 0.012 | 0.592 | 2129 |
| cor3 | Total Vitamin D (sqrt-transformed) | BMD (g/cm²) | -0.062 | 0.00426 | 2129 |
| Predictor | Outcome | r | p-value | n | |
|---|---|---|---|---|---|
| cor | Age (years) | BMD (g/cm²) | -0.232 | <2e-16 | 2286 |
| cor1 | BMI (log-transformed) | BMD (g/cm²) | 0.425 | <2e-16 | 2275 |
| cor2 | Total Calcium (log-transformed) | BMD (g/cm²) | 0.012 | 0.592 | 2129 |
| cor3 | Total Vitamin D (sqrt-transformed) | BMD (g/cm²) | -0.062 | 0.00426 | 2129 |
Interpretation: BMI shows the strongest association with BMD (moderate positive), while age shows a weak negative association. Surprisingly, neither calcium nor vitamin D intake show meaningful positive associations with BMD in this cross-sectional analysis.
Table B: Predictors vs. Exposure (Physical Activity)
# Table B: Predictors vs. Exposure (Physical Activity)
table_b <- rbind(
corr_pair(analytic_data, "RIDAGEYR", "met_sqrt"), # Age vs MET
corr_pair(analytic_data, "bmi_log", "met_sqrt"), # BMI vs MET
corr_pair(analytic_data, "calcium_log", "met_sqrt"), # Calcium vs MET
corr_pair(analytic_data, "vitd_sqrt", "met_sqrt") # Vitamin D vs MET
)
# Add descriptive labels
table_b <- table_b %>%
mutate(
Variable_1 = case_when(
Variable_1 == "RIDAGEYR" ~ "Age (years)",
Variable_1 == "bmi_log" ~ "BMI (log-transformed)",
Variable_1 == "calcium_log" ~ "Total Calcium (log-transformed)",
Variable_1 == "vitd_sqrt" ~ "Total Vitamin D (sqrt-transformed)",
TRUE ~ Variable_1
),
Variable_2 = "Physical Activity (MET, sqrt-transformed)"
)
# Display formatted table
kable(table_b,
digits = 3,
col.names = c("Predictor", "Exposure", "r", "p-value", "n"),
caption = "Table B: Correlations between Predictors and Physical Activity (Exposure)",
align = c('l', 'l', 'r', 'r', 'r'))| Predictor | Exposure | r | p-value | n | |
|---|---|---|---|---|---|
| cor | Age (years) | Physical Activity (MET, sqrt-transformed) | -0.105 | 2.67e-05 | 1588 |
| cor1 | BMI (log-transformed) | Physical Activity (MET, sqrt-transformed) | 0.009 | 0.732 | 1582 |
| cor2 | Total Calcium (log-transformed) | Physical Activity (MET, sqrt-transformed) | 0.068 | 0.00875 | 1500 |
| cor3 | Total Vitamin D (sqrt-transformed) | Physical Activity (MET, sqrt-transformed) | -0.003 | 0.893 | 1500 |
| Predictor | Exposure | r | p-value | n | |
|---|---|---|---|---|---|
| cor | Age (years) | Physical Activity (MET, sqrt-transformed) | -0.105 | 2.67e-05 | 1588 |
| cor1 | BMI (log-transformed) | Physical Activity (MET, sqrt-transformed) | 0.009 | 0.732 | 1582 |
| cor2 | Total Calcium (log-transformed) | Physical Activity (MET, sqrt-transformed) | 0.068 | 0.00875 | 1500 |
| cor3 | Total Vitamin D (sqrt-transformed) | Physical Activity (MET, sqrt-transformed) | -0.003 | 0.893 | 1500 |
Interpretation Age shows a weak negative association with physical activity (older = less active). BMI and vitamin D show no meaningful associations. The calcium-MET relationship is statistically significant but extremely weak (r = 0.068) with minimal practical importance. Vitamin D intake is not associated with physical activity levels - no significant association..
#Table C: Predictors vs. Each Other (Intercorrelations)
table_c <- rbind(
corr_pair(analytic_data, "RIDAGEYR", "bmi_log"),
corr_pair(analytic_data, "RIDAGEYR", "calcium_log"),
corr_pair(analytic_data, "RIDAGEYR", "vitd_sqrt"),
corr_pair(analytic_data, "bmi_log", "calcium_log"),
corr_pair(analytic_data, "bmi_log", "vitd_sqrt"),
corr_pair(analytic_data, "calcium_log", "vitd_sqrt")
)
# Add descriptive labels
table_c <- table_c %>%
mutate(
Variable_1 = case_when(
Variable_1 == "RIDAGEYR" ~ "Age (years)",
Variable_1 == "bmi_log" ~ "BMI (log-transformed)",
Variable_1 == "calcium_log" ~ "Total Calcium (log-transformed)",
Variable_1 == "vitd_sqrt" ~ "Total Vitamin D (sqrt-transformed)",
TRUE ~ Variable_1
),
Variable_2 = case_when(
Variable_2 == "RIDAGEYR" ~ "Age (years)",
Variable_2 == "bmi_log" ~ "BMI (log-transformed)",
Variable_2 == "calcium_log" ~ "Total Calcium (log-transformed)",
Variable_2 == "vitd_sqrt" ~ "Total Vitamin D (sqrt-transformed)",
TRUE ~ Variable_2
)
)
# Display formatted table
kable(table_c,
digits = 3,
col.names = c("Variable 1", "Variable 2", "r", "p-value", "n"),
caption = "Table C: Intercorrelations Among Predictors",
align = c('l', 'l', 'r', 'r', 'r'))| Variable 1 | Variable 2 | r | p-value | n | |
|---|---|---|---|---|---|
| cor | Age (years) | BMI (log-transformed) | -0.093 | 9.28e-06 | 2275 |
| cor1 | Age (years) | Total Calcium (log-transformed) | 0.067 | 0.0021 | 2129 |
| cor2 | Age (years) | Total Vitamin D (sqrt-transformed) | 0.152 | 2.02e-12 | 2129 |
| cor3 | BMI (log-transformed) | Total Calcium (log-transformed) | 0.009 | 0.686 | 2122 |
| cor4 | BMI (log-transformed) | Total Vitamin D (sqrt-transformed) | 0.026 | 0.226 | 2122 |
| cor5 | Total Calcium (log-transformed) | Total Vitamin D (sqrt-transformed) | 0.291 | <2e-16 | 2129 |
| Variable 1 | Variable 2 | r | p-value | n | |
|---|---|---|---|---|---|
| cor | Age (years) | BMI (log-transformed) | -0.093 | 9.28e-06 | 2275 |
| cor1 | Age (years) | Total Calcium (log-transformed) | 0.067 | 0.0021 | 2129 |
| cor2 | Age (years) | Total Vitamin D (sqrt-transformed) | 0.152 | 2.02e-12 | 2129 |
| cor3 | BMI (log-transformed) | Total Calcium (log-transformed) | 0.009 | 0.686 | 2122 |
| cor4 | BMI (log-transformed) | Total Vitamin D (sqrt-transformed) | 0.026 | 0.226 | 2122 |
| cor5 | Total Calcium (log-transformed) | Total Vitamin D (sqrt-transformed) | 0.291 | <2e-16 | 2129 |
Age vs BMI: r = -0.093, p < 0.001, n = 2275 Very weak negative association. Older individuals tend to have slightly lower BMI in this sample. Statistically significant but weak effect.
Age vs Calcium: r = 0.067, p = 0.002, n = 2129 Very weak positive association. Older individuals tend to have slightly higher calcium intake. Statistically significant but minimal practical importance.
Age vs Vitamin D: r = 0.152, p < 0.001, n = 2129 Weak positive association. Older individuals tend to have higher vitamin D intake. This is the strongest age-related correlation among nutrients.
BMI vs Calcium: r = 0.009, p = 0.686, n = 2122 No significant association. BMI is not related to calcium intake.
BMI vs Vitamin D: r = 0.026, p = 0.226, n = 2122 No significant association. BMI is not related to vitamin D intake.
Calcium vs Vitamin D: r = 0.291, p < 0.001, n = 2129 Weak to moderate positive association. Individuals with higher calcium intake tend to have higher vitamin D intake. This is the strongest correlation in Table C, suggesting people who consume more of one nutrient tend to consume more of the other, likely reflecting overall dietary patterns or supplement use.
Write 200–400 words addressing:
A. Comparing Methods (3 points) - When to use ANOVA vs. correlation? - Key differences in what they tell us - Example research questions for each
Answer:
When to use each method: ANOVA is appropriate when comparing means of a continuous outcome across categorical groups (e.g., comparing BMD across ethnic groups). Correlation is used when examining linear relationships between two continuous variables (e.g., age and BMD).
Key differences: ANOVA tests whether group means differ significantly, while correlation quantifies the strength and direction of linear relationships. ANOVA addresses questions like “Do groups differ?” whereas correlation answers “How strongly are variables related?” For instance, ANOVA revealed that ethnicity groups differ in BMD (F = 30.18, p < 0.001), while correlation showed that age is negatively associated with BMD (r = -0.232, p < 0.001).
Example research questions:
ANOVA: Do different treatment groups have different blood pressure levels?
Correlation: Is sodium intake associated with blood pressure?
B. Assumptions & Limitations (4 points) - Which assumptions were challenging? - How might violations affect conclusions? - Limitations of cross-sectional correlation for causal inference
Answer:
Challenging assumptions: The normality assumption was challenging for variables like calcium and MET, which showed severe right-skewness. Transformations (log and square root) were necessary to meet this assumption. The equal variance assumption was easier to satisfy, confirmed by Levene’s test (p = 0.179).
Impact of violations: Violating normality affects correlation coefficients’ accuracy and hypothesis test validity. Severe skewness can inflate Type I error rates and produce misleading correlation estimates. Our transformations addressed this, but some residual non-normality remained in the tails.
Cross-sectional limitations: The most critical limitation is that correlation does not imply causation. Our cross-sectional design prevents determining whether low calcium intake causes poor BMD or whether individuals with existing bone health issues modify their calcium intake. Additionally, we cannot account for unmeasured confounders like genetics, lifetime physical activity history, or medication use. The surprising negative correlation between vitamin D and BMD (r = -0.062) likely reflects reverse causation, people with poor bone health may increase supplement intake.
C. R Programming Challenge (3 points) - What was most difficult? - How did you solve it? - What skill did you strengthen?
Answer:
Most difficult aspect: Honestly, the hardest part was dealing with the transformation workflow and then getting the correlation function to actually work with my data. I kept running into issues where variables I thought existed weren’t being found, or the function would work for one variable but break on another. It was frustrating because the error messages weren’t always clear about what was actually wrong.
Solution approach: I had to slow down and be more systematic. First, I started checking my variable names every single time with names(analytic_data) before writing any new code. That saved me from a lot of “object not found” errors. For the correlation function problems, I eventually figured out that tibbles don’t like the [[]] notation the same way data frames do, so switching to pull() fixed most of my headaches. I also learned to test each correlation individually before trying to combine them all into one table, that way when something broke, I knew exactly which line was the problem.
Skills strengthened: This assignment really forced me to get better at debugging. Before, I would just stare at error messages and hope they made sense. Now I’m more methodical, I check my data structure, verify variable names exist, and test small pieces before building complex code. I also got way more comfortable with the pipe operator and dplyr functions. The whole process of transforming variables, checking them, and then using them in analyses helped me understand how data flows through R in a way that reading examples never did.