# Import the dataset
setwd("C:/Users/AA843241/OneDrive - University at Albany - SUNY/Desktop/EPI553")
bmd = read.csv('bmd.csv', header = T)
names (bmd)## [1] "SEQN" "RIAGENDR" "RIDAGEYR" "RIDRETH1" "BMXBMI" "smoker"
## [7] "totmet" "metcat" "DXXOFBMD" "tbmdcat" "calcium" "vitd"
## [13] "DSQTVD" "DSQTCALC"
## SEQN RIAGENDR RIDAGEYR RIDRETH1 BMXBMI smoker totmet metcat DXXOFBMD
## 1 93705 2 66 4 31.7 2 240 0 1.058
## 2 93708 2 66 5 23.7 3 120 0 0.801
## 3 93709 2 75 4 38.9 1 720 1 0.880
## 4 93711 1 56 5 21.3 3 840 1 0.851
## 5 93713 1 67 3 23.5 1 360 0 0.778
## 6 93714 2 54 4 39.9 2 NA NA 0.994
## 7 93715 1 71 5 22.5 1 6320 2 0.952
## 8 93716 1 61 5 30.7 2 2400 2 1.121
## 9 93721 2 60 1 35.9 3 NA NA NA
## 10 93722 2 60 3 23.8 3 NA NA 0.752
## tbmdcat calcium vitd DSQTVD DSQTCALC
## 1 0 503.5 1.85 20.557 211.67
## 2 1 473.5 5.85 25.000 820.00
## 3 0 NA NA NA NA
## 4 1 1248.5 3.85 25.000 35.00
## 5 1 660.5 2.35 NA 13.33
## 6 0 776.0 5.65 NA NA
## 7 0 452.0 3.75 NA 26.67
## 8 0 853.5 4.45 100.000 1066.67
## 9 NA 929.0 6.05 50.000 35.00
## 10 1 1183.5 6.45 46.667 133.33
CONVERTING variables to factors
# Set seed for reproducibility
set.seed(553)
# Creating variable categories and prepare data
bmd_data <- bmd %>%
filter(RIDAGEYR >= 18, RIDAGEYR <= 80) %>% # Adults 18-80
mutate(
ethnicity_category = case_when(
RIDRETH1 == 1 ~ "Mexican American",
RIDRETH1 == 2 ~ "Other Hispanic",
RIDRETH1 == 3 ~ "Non-Hispanic White",
RIDRETH1 == 4 ~ "Non-Hispanic Black",
RIDRETH1 == 5 ~ "Other" ), #created ethnicity_category
gender_category = case_when(
RIAGENDR == 1 ~ "Male",
RIAGENDR == 2 ~ "Female" ), #Created gender_category
smoker_category = case_when(
smoker == 1 ~ "Current",
smoker == 2 ~ "Past",
smoker == 3 ~ "Never" ), #created smoker_category
# convert to factors
ethnicity_category = factor(ethnicity_category,
levels = c("Mexican American",
"Other Hispanic", "Non-Hispanic White",
"Non-Hispanic Black", "Other")),
gender_category = factor(gender_category,
levels=c("Male", "Female")),
smoker_category = factor(smoker_category,
levels=c("Current", "Past", "Never"))
) %>%
select(DXXOFBMD, RIDAGEYR,RIDRETH1, ethnicity_category, RIAGENDR, gender_category, RIDAGEYR, smoker, smoker_category, BMXBMI,totmet, metcat, tbmdcat, calcium, vitd, DSQTVD, DSQTCALC)
# Show first 6 rows
head (bmd_data) %>%
kable(digits=2, caption = "mutated BMD Dataset (first 6 rows)")| DXXOFBMD | RIDAGEYR | RIDRETH1 | ethnicity_category | RIAGENDR | gender_category | smoker | smoker_category | BMXBMI | totmet | metcat | tbmdcat | calcium | vitd | DSQTVD | DSQTCALC |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1.06 | 66 | 4 | Non-Hispanic Black | 2 | Female | 2 | Past | 31.7 | 240 | 0 | 0 | 503.5 | 1.85 | 20.56 | 211.67 |
| 0.80 | 66 | 5 | Other | 2 | Female | 3 | Never | 23.7 | 120 | 0 | 1 | 473.5 | 5.85 | 25.00 | 820.00 |
| 0.88 | 75 | 4 | Non-Hispanic Black | 2 | Female | 1 | Current | 38.9 | 720 | 1 | 0 | NA | NA | NA | NA |
| 0.85 | 56 | 5 | Other | 1 | Male | 3 | Never | 21.3 | 840 | 1 | 1 | 1248.5 | 3.85 | 25.00 | 35.00 |
| 0.78 | 67 | 3 | Non-Hispanic White | 1 | Male | 1 | Current | 23.5 | 360 | 0 | 1 | 660.5 | 2.35 | NA | 13.33 |
| 0.99 | 54 | 4 | Non-Hispanic Black | 2 | Female | 2 | Past | 39.9 | NA | NA | 0 | 776.0 | 5.65 | NA | NA |
##
## Mexican American Other Hispanic Non-Hispanic White Non-Hispanic Black
## 331 286 1086 696
## Other
## 499
##
## Male Female
## 1431 1467
##
## Current Past Never
## 449 894 1553
# YOUR CODE HERE
bmd_data <- bmd_data %>%
filter(RIDAGEYR >= 18, RIDAGEYR <= 80) %>%
select(DXXOFBMD, RIDAGEYR,RIDRETH1, ethnicity_category, RIAGENDR, gender_category, RIDAGEYR, smoker, smoker_category, BMXBMI,totmet, metcat, tbmdcat, calcium, vitd, DSQTVD, DSQTCALC)
#Display sample size
data.frame(
Metric = "Sample Size",
Value = paste(nrow(bmd_data), "adults")
) %>%
kable()| Metric | Value |
|---|---|
| Sample Size | 2898 adults |
## [1] 6954
## DXXOFBMD RIDAGEYR RIDRETH1 ethnicity_category
## 612 0 0 0
## RIAGENDR gender_category smoker smoker_category
## 0 0 2 2
## BMXBMI totmet metcat tbmdcat
## 64 964 964 612
## calcium vitd DSQTVD DSQTCALC
## 293 293 1515 1633
#Checking the missing values in the cleaned dataset
sum(is.na(bmd_cleaned_data)) # Total missing values## [1] 4186
## DXXOFBMD RIDAGEYR RIDRETH1 ethnicity_category
## 0 0 0 0
## RIAGENDR gender_category smoker smoker_category
## 0 0 0 0
## BMXBMI totmet metcat tbmdcat
## 11 698 698 0
## calcium vitd DSQTVD DSQTCALC
## 157 157 1186 1279
#Display cleaned sample size
data.frame(
Metric = "Sample Size",
Value = paste(nrow(bmd_cleaned_data), "adults")
) %>%
kable()| Metric | Value |
|---|---|
| Sample Size | 2286 adults |
Final cleaned sample size is 2286 adults with 3 new variables ethnicity_category, gender_category, smoker_category
Research Question: Do mean BMD values differ across ethnicity groups
plain language: Bone mineral density is equal across all five ethnicity categories
Alternative hypothesis(H1): At least one population means differs from the others Significance level: α = 0.05
Create table: n, mean, SD of BMD by ethnicity Plot with jittered points showing BMD distributions
# Calculate summary statistics by BMI category
summary_stats <- bmd_cleaned_data %>%
group_by(ethnicity_category) %>%
summarise(
n = n(),
Mean = mean(DXXOFBMD),
SD = sd(DXXOFBMD),
Median = median(DXXOFBMD),
Min = min(DXXOFBMD),
Max = max(DXXOFBMD)
)
summary_stats %>%
kable(digits = 2,
caption = "Descriptive Statistics: Bone mineral density by ethnicity/race")| ethnicity_category | n | Mean | SD | Median | Min | Max |
|---|---|---|---|---|---|---|
| Mexican American | 255 | 0.95 | 0.15 | 0.96 | 0.44 | 1.46 |
| Other Hispanic | 244 | 0.95 | 0.16 | 0.94 | 0.55 | 1.39 |
| Non-Hispanic White | 846 | 0.89 | 0.16 | 0.88 | 0.37 | 1.46 |
| Non-Hispanic Black | 532 | 0.97 | 0.16 | 0.98 | 0.44 | 1.55 |
| Other | 409 | 0.90 | 0.15 | 0.89 | 0.54 | 1.34 |
Observation: The mean BMD appears to slightly lower among Non-Hispanic white(0.9) and Other ethnicity (0.9)
# Create boxplots with individual points
ggplot(bmd_cleaned_data,
aes(x = ethnicity_category, y = DXXOFBMD, fill = ethnicity_category)) +
geom_boxplot(alpha = 0.7, outlier.shape = NA) +
geom_jitter(width = 0.2, alpha = 0.1, size = 0.5) +
scale_fill_brewer(palette = "Set2") +
labs(
title = "BMD by ethnicity Category",
subtitle = "BMD Dataset, Adults aged 18-65",
x = "ethnicity Category",
y = "BMD (g/cm²)",
fill = "ethnicity Category"
) +
theme_minimal(base_size = 12) +
theme(legend.position = "none")
What the plot tells us:
# Fit the one-way ANOVA model
anova_model <- aov(DXXOFBMD ~ ethnicity_category, data = bmd_cleaned_data)
# Display the ANOVA table
summary(anova_model)## Df Sum Sq Mean Sq F value Pr(>F)
## ethnicity_category 4 2.95 0.7371 30.18 <2e-16 ***
## Residuals 2281 55.70 0.0244
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Interpretation:
A: Normality: Residuals are approximately normally distributed B: Homogeneity of variance: Almost equal variances across groups
#Levene's test for homogeneity of variance
levene_test <- leveneTest(DXXOFBMD ~ ethnicity_category, data = bmd_cleaned_data)
print(levene_test)## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 4 1.5728 0.1788
## 2281
Diagnostic Plot Interpretation: Diagnostic plots suggest that assumptions of linearity, normality, and homoscedasticity are reasonably met. The Q-Q plot shows residuals approximately following a normal distribution. The Scale-Location plot does not indicate substantial heteroscedasticity. A few observations show slightly higher leverage, but no strongly influential points are evident.
Levene’s Test Interpretation: - p-value: 0.179 - If p < 0.05, we would reject equal variances - Here: Equal variance assumption is met
Overall Assessment: With n > 2000, ANOVA is robust to minor violations. Our assumptions are reasonably satisfied.
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = DXXOFBMD ~ ethnicity_category, data = bmd_cleaned_data)
##
## $ethnicity_category
## diff lwr upr
## Other Hispanic-Mexican American -0.004441273 -0.042644130 0.03376158
## Non-Hispanic White-Mexican American -0.062377249 -0.092852618 -0.03190188
## Non-Hispanic Black-Mexican American 0.022391619 -0.010100052 0.05488329
## Other-Mexican American -0.053319344 -0.087357253 -0.01928144
## Non-Hispanic White-Other Hispanic -0.057935976 -0.088934694 -0.02693726
## Non-Hispanic Black-Other Hispanic 0.026832892 -0.006150151 0.05981593
## Other-Other Hispanic -0.048878071 -0.083385341 -0.01437080
## Non-Hispanic Black-Non-Hispanic White 0.084768868 0.061164402 0.10837333
## Other-Non-Hispanic White 0.009057905 -0.016633367 0.03474918
## Other-Non-Hispanic Black -0.075710963 -0.103764519 -0.04765741
## p adj
## Other Hispanic-Mexican American 0.9978049
## Non-Hispanic White-Mexican American 0.0000003
## Non-Hispanic Black-Mexican American 0.3276613
## Other-Mexican American 0.0001919
## Non-Hispanic White-Other Hispanic 0.0000036
## Non-Hispanic Black-Other Hispanic 0.1722466
## Other-Other Hispanic 0.0010720
## Non-Hispanic Black-Non-Hispanic White 0.0000000
## Other-Non-Hispanic White 0.8719109
## Other-Non-Hispanic Black 0.0000000
Interpretation: Statistically significant differences
in BMD are among: Non-Hispanic White to Mexican American, Other to
Mexican American, Non-Hispanic White to Other Hispanic, Other to Other
Hispanic, Non-Hispanic Black to Non-Hispanic White and Other to
Non-Hispanic Black because their family-wise Confidence Intervals do not
cross the Null value and p <0.05
# Extract sum of squares from ANOVA table
anova_summary <- summary(anova_model)[[1]]
ss_treatment <- anova_summary$`Sum Sq`[1]
ss_total <- sum(anova_summary$`Sum Sq`)
# Calculate eta-squared
eta_squared <- ss_treatment / ss_total
cat("Eta-squared (η²):", round(eta_squared, 4), "\n")## Eta-squared (η²): 0.0503
## Percentage of variance explained: 5.03 %
Interpretation: ethnicity category explains 5.03% of the variance in Bone Mineral Density.
Research Question: Is there a correlation between ethnicity and BMD among US adults?
# Set seed for reproducibility
set.seed(553)
# Creating variable categories and prepare data
bmd_cor_data <- bmd_cleaned_data %>%
filter(RIDAGEYR >= 18, RIDAGEYR <= 80) %>% # Adults 18-80
mutate(
calcium_total = (coalesce(calcium,0) + coalesce(DSQTCALC,0)),
vitd_total = (coalesce(vitd,0) + coalesce(DSQTVD,0))
)%>%
select(DXXOFBMD, calcium, vitd, DSQTVD, DSQTCALC, calcium_total, vitd_total, RIDAGEYR,RIDRETH1, ethnicity_category, RIAGENDR, gender_category, smoker, smoker_category, BMXBMI,totmet, metcat, tbmdcat)
# Show first 7 rows
head (bmd_cor_data) %>%
kable(digits=2, caption = "BMD Dataset with created total intake variables (first 7 rows)")| DXXOFBMD | calcium | vitd | DSQTVD | DSQTCALC | calcium_total | vitd_total | RIDAGEYR | RIDRETH1 | ethnicity_category | RIAGENDR | gender_category | smoker | smoker_category | BMXBMI | totmet | metcat | tbmdcat |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1.06 | 503.5 | 1.85 | 20.56 | 211.67 | 715.17 | 22.41 | 66 | 4 | Non-Hispanic Black | 2 | Female | 2 | Past | 31.7 | 240 | 0 | 0 |
| 0.80 | 473.5 | 5.85 | 25.00 | 820.00 | 1293.50 | 30.85 | 66 | 5 | Other | 2 | Female | 3 | Never | 23.7 | 120 | 0 | 1 |
| 0.88 | NA | NA | NA | NA | 0.00 | 0.00 | 75 | 4 | Non-Hispanic Black | 2 | Female | 1 | Current | 38.9 | 720 | 1 | 0 |
| 0.85 | 1248.5 | 3.85 | 25.00 | 35.00 | 1283.50 | 28.85 | 56 | 5 | Other | 1 | Male | 3 | Never | 21.3 | 840 | 1 | 1 |
| 0.78 | 660.5 | 2.35 | NA | 13.33 | 673.83 | 2.35 | 67 | 3 | Non-Hispanic White | 1 | Male | 1 | Current | 23.5 | 360 | 0 | 1 |
| 0.99 | 776.0 | 5.65 | NA | NA | 776.00 | 5.65 | 54 | 4 | Non-Hispanic Black | 2 | Female | 2 | Past | 39.9 | NA | NA | 0 |
## [1] 4186
## DXXOFBMD calcium vitd DSQTVD
## 0 157 157 1186
## DSQTCALC calcium_total vitd_total RIDAGEYR
## 1279 0 0 0
## RIDRETH1 ethnicity_category RIAGENDR gender_category
## 0 0 0 0
## smoker smoker_category BMXBMI totmet
## 0 0 11 698
## metcat tbmdcat
## 698 0
We can see that there is no missing values in calcium_total and vitd_total variables
###Create histograms for BMI, calcium, Vitamin D, MET
set.seed(553)
# Calculate measures
mean_BMI <- mean(bmd_cor_data$BMXBMI, na.rm = TRUE)
median_BMI <- median(bmd_cor_data$BMXBMI, na.rm = TRUE)
# Visualize
ggplot(bmd_cor_data, aes(x = BMXBMI)) +
geom_histogram(bins = 30, fill = "steelblue", alpha = 0.7) +
geom_vline(aes(xintercept = mean_BMI, color = "Mean"),
linewidth = 1.2, linetype = "dashed") +
geom_vline(aes(xintercept = median_BMI, color = "Median"),
linewidth = 1.2, linetype = "dashed") +
scale_color_manual(values = c("Mean" = "red", "Median" = "blue")) +
labs(
title = "BMI Distribution (kg/m²)",
x = "BMI", y = "Frequency",
color = "Measure"
) +
theme_minimal()## Mean: 29 kg/m²
## Median: 28.2 kg/m²
set.seed(553)
# Calculate measures
mean_calcium_total <- mean(bmd_cor_data$calcium_total, na.rm = TRUE)
median_calcium_total <- median(bmd_cor_data$calcium_total, na.rm = TRUE)
# Visualize
ggplot(bmd_cor_data, aes(x = calcium_total)) +
geom_histogram(bins = 30, fill = "steelblue", alpha = 0.7) +
geom_vline(aes(xintercept = mean_calcium_total, color = "Mean"),
linewidth = 1.2, linetype = "dashed") +
geom_vline(aes(xintercept = median_calcium_total, color = "Median"),
linewidth = 1.2, linetype = "dashed") +
scale_color_manual(values = c("Mean" = "red", "Median" = "blue")) +
labs(
title = "Total Calcium intake Distribution (mg/day)",
x = "Total Calcium intake", y = "Frequency",
color = "Measure"
) +
theme_minimal()## Mean: 943.83 mg/day
## Median: 865.25 mg/day²
set.seed(553)
# Calculate measures
mean_vitd_total <- mean(bmd_cor_data$vitd_total, na.rm = TRUE)
median_vitd_total <- median(bmd_cor_data$vitd_total, na.rm = TRUE)
# Visualize
ggplot(bmd_cor_data, aes(x = vitd_total)) +
geom_histogram(bins = 30, fill = "steelblue", alpha = 0.7) +
geom_vline(aes(xintercept = mean_vitd_total, color = "Mean"),
linewidth = 1.2, linetype = "dashed") +
geom_vline(aes(xintercept = median_vitd_total, color = "Median"),
linewidth = 1.2, linetype = "dashed") +
scale_color_manual(values = c("Mean" = "red", "Median" = "blue")) +
labs(
title = "Total Vitamin D intake Distribution (mcg/day)",
x = "Total Vitamin D intake", y = "Frequency",
color = "Measure"
) +
theme_minimal()## Mean: 28.81 mcg/day
## Median: 8.34 mcg/day
set.seed(553)
# Calculate measures
mean_totmet <- mean(bmd_cor_data$totmet, na.rm = TRUE)
median_totmet <- median(bmd_cor_data$totmet, na.rm = TRUE)
# Visualize
ggplot(bmd_cor_data, aes(x = totmet)) +
geom_histogram(bins = 30, fill = "steelblue", alpha = 0.7) +
geom_vline(aes(xintercept = mean_totmet, color = "Mean"),
linewidth = 1.2, linetype = "dashed") +
geom_vline(aes(xintercept = median_totmet, color = "Median"),
linewidth = 1.2, linetype = "dashed") +
scale_color_manual(values = c("Mean" = "red", "Median" = "blue")) +
labs(
title = "Physical activity Distribution (MET-min/week)",
x = "Physical activity", y = "Frequency",
color = "Measure"
) +
theme_minimal()## Mean: 1026.47 MET-min/week
## Median: 480 MET-min/week
BMI - almost simmetrical, slightly skewed to the right, means approx. equal to median Totmet - skewed to the right - mean>median Total Calcium intake - skewed to the right - mean>median Total vitamin D intake - skewed to the right - mean>median
Therefore following suggested transformations - BMI, calcium - log, and MET, vitd - Square root
bmd_trans_cor <- bmd_cor_data %>%
mutate(
BMI_log = log(BMXBMI + 0.01),
calcium_log = log(calcium_total + 0.01),
MET_sqrt = sqrt(totmet),
vitd_total_sqrt = sqrt(vitd_total )
)set.seed(553)
# Calculate measures
mean_calcium_log <- mean(bmd_trans_cor$calcium_log, na.rm = TRUE)
median_calcium_log <- median(bmd_trans_cor$calcium_log, na.rm = TRUE)
# Visualize
ggplot(bmd_trans_cor, aes(x = calcium_log)) +
geom_histogram(bins = 30, fill = "steelblue", alpha = 0.7) +
geom_vline(aes(xintercept = mean_calcium_log, color = "Mean"),
linewidth = 1.2, linetype = "dashed") +
geom_vline(aes(xintercept = median_calcium_log, color = "Median"),
linewidth = 1.2, linetype = "dashed") +
scale_color_manual(values = c("Mean" = "red", "Median" = "blue")) +
labs(
title = " Transformed Calcium intake Distribution (mg/day)",
x = "calcium_log", y = "Frequency",
color = "Measure"
) +
theme_minimal()## Mean: 6.25 mg/day
## Median: 6.76 mg/day
set.seed(553)
# Calculate measures
mean_MET_sqrt <- mean(bmd_trans_cor$MET_sqrt, na.rm = TRUE)
median_MET_sqrt <- median(bmd_trans_cor$MET_sqrt, na.rm = TRUE)
# Visualize
ggplot(bmd_trans_cor, aes(x = MET_sqrt)) +
geom_histogram(bins = 30, fill = "steelblue", alpha = 0.7) +
geom_vline(aes(xintercept = mean_MET_sqrt, color = "Mean"),
linewidth = 1.2, linetype = "dashed") +
geom_vline(aes(xintercept = median_MET_sqrt, color = "Median"),
linewidth = 1.2, linetype = "dashed") +
scale_color_manual(values = c("Mean" = "red", "Median" = "blue")) +
labs(
title = "Transformed Physcial activity Distribution (MET-min/Week)",
x = "Met_sqrt", y = "Frequency",
color = "Measure"
) +
theme_minimal()## Mean: 27.38 mg/day
## Median: 21.91 mg/day
set.seed(553)
# Calculate measures
mean_vitd_total_sqrt <- mean(bmd_trans_cor$vitd_total_sqrt, na.rm = TRUE)
median_vitd_total_sqrt <- median(bmd_trans_cor$vitd_total_sqrt, na.rm = TRUE)
# Visualize
ggplot(bmd_trans_cor, aes(x =vitd_total_sqrt)) +
geom_histogram(bins = 30, fill = "steelblue", alpha = 0.7) +
geom_vline(aes(xintercept = mean_vitd_total_sqrt, color = "Mean"),
linewidth = 1.2, linetype = "dashed") +
geom_vline(aes(xintercept = median_vitd_total_sqrt, color = "Median"),
linewidth = 1.2, linetype = "dashed") +
scale_color_manual(values = c("Mean" = "red", "Median" = "blue")) +
labs(
title = "Transformed Vitamin D intake Distribution (mcg/day)",
x = "vitd_total_sqrt", y = "Frequency",
color = "Measure"
) +
theme_minimal()## Mean: 4 mcg/day
## Median: 2.89 mcg/day
#Document reasoning: BMI - almost symmetrical, slightly skewed to the right, means approx. equal to median Totmet - skewed to the right - mean>median Total Calcium intake - skewed to the right - mean>median Total vitamin D intake - skewed to the right - mean>median
corr_pair <- function(x, y) {
test <- cor.test(x, y, method = "pearson")
data.frame(
r = as.numeric(test$estimate),
p_value = test$p.value
)
}
#list of predictors
predictors <- c("RIDAGEYR", "BMI_log", "calcium_log", "vitd_total_sqrt")
# Table A: Predictors vs Outcome (BMD)
table_A <- map_dfr(predictors, function(var) {
corr_pair(bmd_trans_cor[[var]], bmd_trans_cor$DXXOFBMD) %>%
mutate(var1 = var,
var2 = "DXXOFBMD")
})
cat("\nTable A: Predictors vs Outcome (BMD)\n")##
## Table A: Predictors vs Outcome (BMD)
| r | p_value | var1 | var2 |
|---|---|---|---|
| -0.232 | 0.000 | RIDAGEYR | DXXOFBMD |
| 0.425 | 0.000 | BMI_log | DXXOFBMD |
| 0.029 | 0.162 | calcium_log | DXXOFBMD |
| -0.051 | 0.015 | vitd_total_sqrt | DXXOFBMD |
# ️Table B: Predictors vs Exposure (Physical Activity)
table_B <- map_dfr(predictors, function(var) {
corr_pair(bmd_trans_cor[[var]], bmd_trans_cor$MET_sqrt) %>%
mutate(var1 = var,
var2 = "MET_sqrt")
})
cat("\nTable B: Predictors vs Exposure (Physical Activity)\n")##
## Table B: Predictors vs Exposure (Physical Activity)
| r | p_value | var1 | var2 |
|---|---|---|---|
| -0.105 | 0.000 | RIDAGEYR | MET_sqrt |
| 0.009 | 0.732 | BMI_log | MET_sqrt |
| 0.062 | 0.014 | calcium_log | MET_sqrt |
| -0.002 | 0.934 | vitd_total_sqrt | MET_sqrt |
# Table C: Predictors vs Each Other
pairs <- combn(predictors, 2, simplify = FALSE)
table_C <- map_dfr(pairs, function(x) {
corr_pair(bmd_trans_cor[[x[1]]], bmd_trans_cor[[x[2]]]) %>%
mutate(var1 = x[1],
var2 = x[2])
})
cat("\nTable C: Predictors vs Each Other\n")##
## Table C: Predictors vs Each Other
| r | p_value | var1 | var2 |
|---|---|---|---|
| -0.093 | 0.000 | RIDAGEYR | BMI_log |
| -0.032 | 0.124 | RIDAGEYR | calcium_log |
| 0.146 | 0.000 | RIDAGEYR | vitd_total_sqrt |
| 0.038 | 0.071 | BMI_log | calcium_log |
| 0.024 | 0.250 | BMI_log | vitd_total_sqrt |
| 0.191 | 0.000 | calcium_log | vitd_total_sqrt |
# Q-Q plots for normality
par(mfrow = c(1, 2))
qqnorm(bmd_trans_cor$RIDAGEYR, main = "Q-Q Plot: Age")
qqline(bmd_trans_cor$RIDAGEYR, col = "red")
qqnorm(bmd_trans_cor$DXXOFBMD, main = "Q-Q Plot: BMD")
qqline(bmd_trans_cor$DXXOFBMD, col = "red")qqnorm(bmd_trans_cor$BMI_log, main = "Q-Q Plot: BMI")
qqline(bmd_trans_cor$BMI_log, col = "red")
qqnorm(bmd_trans_cor$calcium_log, main = "Q-Q Plot: Calcium")
qqline(bmd_trans_cor$calcium_log, col = "red")qqnorm(bmd_trans_cor$vitd_total_sqrt, main = "Q-Q Plot: VitD")
qqline(bmd_trans_cor$vitd_total_sqrt, col = "red")
qqnorm(bmd_trans_cor$MET_sqrt, main = "Q-Q Plot: Phys Activity")
qqline(bmd_trans_cor$MET_sqrt, col = "red")###Interpretations: 1)In table A we see statistically
significant: - negative weak correlation between Age
and Bone Mineral density, in other words there is weak indirect
association between persons age and BMD - positive moderate correlation
between BMI-log and Bone Mineral density, in other
words there is moderate direct association between persons BMI and BMD
-Negative very weak correlation between Vitamin D intake and
Bone Mineral Density 2) In table B we see statistically
significant: - Negative weak correlation between AGE
and Physical Activity in met-min per week, in other words there
is a weak association between person being older and persons’ physical
activity
- Very weak positive correlation between total Calcium intake
and Physical Activity in met-min per week. Other associations
between predictors and physical activity are not statistically
significant. 3)In table C we see statistically
significant: - negative weak correlation between AGE
and BMI, in other words there is weak association between
person being older and person’s BMI being lower - positive very weak
correlation between AGE and total vitamin D intake
-positive very weak correlation between Total calcium D intake
and total vitamin D intake
###Conclusion: Although Q-Q plots shows some deviation from normality for some variables, Person’s Corellation was used because of the large sample size(>2000), which makes the method relatively robust to moderate departures from normality. Adiitionally we could check with Person’s correlation
corr_pair_spearman <- function(x, y) {
test <- cor.test(x, y, method = "spearman", exact = FALSE)
data.frame(
rho = as.numeric(test$estimate),
p_value = test$p.value
)
}
#list of predictors
predictors <- c("RIDAGEYR", "BMI_log", "calcium_log", "vitd_total_sqrt")
# Table A: Predictors vs Outcome (BMD)
table_A_spear <- map_dfr(predictors, function(var) {
corr_pair_spearman(
bmd_trans_cor[[var]],
bmd_trans_cor$DXXOFBMD
) %>%
mutate(
predictor = var,
outcome = "DXXOFBMD"
)
})
kable(table_A_spear, digits = 3,
caption = "Spearman Correlation: Predictors vs BMD Outcome")| rho | p_value | predictor | outcome |
|---|---|---|---|
| -0.214 | 0.000 | RIDAGEYR | DXXOFBMD |
| 0.402 | 0.000 | BMI_log | DXXOFBMD |
| 0.023 | 0.264 | calcium_log | DXXOFBMD |
| -0.064 | 0.002 | vitd_total_sqrt | DXXOFBMD |
# ️Table B: Predictors vs Exposure (Physical Activity)
table_B_spear <- map_dfr(predictors, function(var) {
corr_pair_spearman(
bmd_trans_cor[[var]],
bmd_trans_cor$MET_sqrt
) %>%
mutate(
predictor = var,
outcome = "MET_sqrt"
)
})
kable(table_B_spear, digits = 3,
caption = "Spearman Correlation: Predictors vs Physical Activity")| rho | p_value | predictor | outcome |
|---|---|---|---|
| -0.095 | 0.000 | RIDAGEYR | MET_sqrt |
| 0.030 | 0.237 | BMI_log | MET_sqrt |
| 0.092 | 0.000 | calcium_log | MET_sqrt |
| -0.003 | 0.889 | vitd_total_sqrt | MET_sqrt |
# Table C: Predictors vs Each Other
pairs <- combn(predictors, 2, simplify = FALSE)
table_C_spear <- map_dfr(pairs, function(x) {
corr_pair_spearman(
bmd_trans_cor[[x[1]]],
bmd_trans_cor[[x[2]]]
) %>%
mutate(
variable1 = x[1],
variable2 = x[2]
)
})
kable(table_C_spear, digits = 3,
caption = "Spearman Correlation: Predictors vs Each Other")| rho | p_value | variable1 | variable2 |
|---|---|---|---|
| -0.084 | 0.000 | RIDAGEYR | BMI_log |
| 0.038 | 0.069 | RIDAGEYR | calcium_log |
| 0.190 | 0.000 | RIDAGEYR | vitd_total_sqrt |
| 0.023 | 0.265 | BMI_log | calcium_log |
| -0.012 | 0.578 | BMI_log | vitd_total_sqrt |
| 0.452 | 0.000 | calcium_log | vitd_total_sqrt |
Using Spearman’s correlation slightly changes the numbers, but overall significance, directions and strength did not change for all variables.
1)We use ANOVA when we need to detect the differences between more than 2 independent samples, for example in our case we have compared BMD between 5 ethnicity groups. Example of research question: if we have changed AGE from continuous variable to categories - younger adults, adults-45-65, older adults >65, we could have a research question- Is there a difference in BMd across all the group categories. 2) We use correlation to see any association that is statistically significant among multiple variables in one sample. Although correlation is not causation, it might give information what should be explored in prospective cohort studies or RCt if we want to determine temporality and causality. Example of research question: Is there any correlation between sugar intake and BMI of a person? As we checked QQ plots for normality of the data we observe some deviation from normality on couple of variables, after calculation there were no drastic difference in results.
As we have rather large sample size (>2000) I basically almost ignored outliers, because ANOVA is robust to outliers. Pearson on the other hand is sensitive for outliers. It would be right to check for outliers for Correlation calculation part. So Q-Q plots show some outliers on variables and deviation from normality in data, therefore I used additionally Spearman’s correlation. Of course limitation of correlation in cross-sectional study is that exposure, predictors and outcome is measured in one point of time, therefore we can not prove temporality. And correlation does not mean caution, we should be very careful with interpretations ans conclusions. Most of the cross-sectional studies conclude - we need to conduct more longitudinal studies to check causality of exposure and outcome variables. In addition, unlike regression analysis, correlation analysis does not adjust for potential confounding variables. Therefore, observed associations may be influenced by other unmeasured factors. Even though multicollinearity diagnostics were considered in the analysis, correlation results should still be interpreted cautiously.