#2. Import dataset for assignment #1
getwd()
## [1] "/Users/samriddhi/Downloads"
setwd("/Users/samriddhi/Desktop/DrPH/HEPI_553_Muntasir Musum/Assignments/Assign#1")
data_bmd <- read.csv("bmd.csv")
#view dataset bmd
str(data_bmd)
## 'data.frame': 2898 obs. of 14 variables:
## $ SEQN : int 93705 93708 93709 93711 93713 93714 93715 93716 93721 93722 ...
## $ RIAGENDR: int 2 2 2 1 1 2 1 1 2 2 ...
## $ RIDAGEYR: int 66 66 75 56 67 54 71 61 60 60 ...
## $ RIDRETH1: int 4 5 4 5 3 4 5 5 1 3 ...
## $ BMXBMI : num 31.7 23.7 38.9 21.3 23.5 39.9 22.5 30.7 35.9 23.8 ...
## $ smoker : int 2 3 1 3 1 2 1 2 3 3 ...
## $ totmet : int 240 120 720 840 360 NA 6320 2400 NA NA ...
## $ metcat : int 0 0 1 1 0 NA 2 2 NA NA ...
## $ DXXOFBMD: num 1.058 0.801 0.88 0.851 0.778 ...
## $ tbmdcat : int 0 1 0 1 1 0 0 0 NA 1 ...
## $ calcium : num 504 474 NA 1248 660 ...
## $ vitd : num 1.85 5.85 NA 3.85 2.35 5.65 3.75 4.45 6.05 6.45 ...
## $ DSQTVD : num 20.6 25 NA 25 NA ...
## $ DSQTCALC: num 211.7 820 NA 35 13.3 ...
summary(data_bmd)
## SEQN RIAGENDR RIDAGEYR RIDRETH1
## Min. : 93705 Min. :1.000 Min. :50.00 Min. :1.000
## 1st Qu.: 95902 1st Qu.:1.000 1st Qu.:58.00 1st Qu.:3.000
## Median : 98233 Median :2.000 Median :64.00 Median :3.000
## Mean : 98257 Mean :1.506 Mean :65.21 Mean :3.257
## 3rd Qu.:100581 3rd Qu.:2.000 3rd Qu.:73.00 3rd Qu.:4.000
## Max. :102952 Max. :2.000 Max. :80.00 Max. :5.000
##
## BMXBMI smoker totmet metcat
## Min. :14.20 Min. :1.000 Min. : 0 Min. :0.0000
## 1st Qu.:25.20 1st Qu.:2.000 1st Qu.: 240 1st Qu.:0.0000
## Median :28.70 Median :3.000 Median : 480 Median :0.0000
## Mean :29.76 Mean :2.381 Mean :1015 Mean :0.7048
## 3rd Qu.:33.40 3rd Qu.:3.000 3rd Qu.:1430 3rd Qu.:1.0000
## Max. :84.40 Max. :3.000 Max. :9120 Max. :2.0000
## NA's :64 NA's :2 NA's :964 NA's :964
## DXXOFBMD tbmdcat calcium vitd
## Min. :0.3700 Min. :0.0000 Min. : 55.0 Min. : 0.000
## 1st Qu.:0.8123 1st Qu.:0.0000 1st Qu.: 525.0 1st Qu.: 1.750
## Median :0.9165 Median :0.0000 Median : 773.0 Median : 3.200
## Mean :0.9223 Mean :0.3985 Mean : 840.5 Mean : 4.471
## 3rd Qu.:1.0320 3rd Qu.:1.0000 3rd Qu.:1049.0 3rd Qu.: 5.500
## Max. :1.5500 Max. :2.0000 Max. :5199.0 Max. :57.900
## NA's :612 NA's :612 NA's :293 NA's :293
## DSQTVD DSQTCALC
## Min. : 0.004 Min. : 0.74
## 1st Qu.: 15.709 1st Qu.: 93.33
## Median : 25.000 Median : 211.67
## Mean : 49.851 Mean : 357.98
## 3rd Qu.: 50.000 3rd Qu.: 500.00
## Max. :2570.000 Max. :3220.00
## NA's :1515 NA's :1633
#3. Convert RIDRETH1 to factor with meaningful ethnicity labels
# Convert RIDRETH1 to factor
data_bmd$RIDRETH1 <- factor(
data_bmd$RIDRETH1,
levels = c(1, 2, 3, 4, 5),
labels = c(
"American, Hispanic",
"Other, Hispanic",
"White, Non-Hispanic",
"Black, Non-Hispanic",
"Other Race"
)
)
table(data_bmd$RIDRETH1)
##
## American, Hispanic Other, Hispanic White, Non-Hispanic Black, Non-Hispanic
## 331 286 1086 696
## Other Race
## 499
#4. Convert RIAGENDR to factor (Male/Female)
data_bmd$RIAGENDR <- factor(data_bmd$RIAGENDR,
levels = c(1, 2 ),
labels = c("Male,",
"Female"
))
table(data_bmd$RIAGENDR)
##
## Male, Female
## 1431 1467
#5. Convert smoker to factor
data_bmd$smoker <- factor(data_bmd$smoker,
levels = c(1, 2, 3),
labels = c("Current", "Past", "Never"))
table(data_bmd$smoker, useNA = "ifany")
##
## Current Past Never <NA>
## 449 894 1553 2
#6. Report total sample size
# Display sample size
data.frame(
Metric = "Sample Size",
Value = paste(nrow(data_bmd), "adults")
) %>%
kable()
| Metric | Value |
|---|---|
| Sample Size | 2898 adults |
#7. Create analytic dataset removing missing values in BMD, ethnicity & gender
data_clean <- data_bmd %>%
filter(!is.na(DXXOFBMD) & !is.na(RIAGENDR)) %>%
droplevels()
#8. Report final analytic sample size
data.frame(
Metric = "Sample Size",
Value = paste(nrow(data_clean), "adults")
) %>%
kable()
| Metric | Value |
|---|---|
| Sample Size | 2286 adults |
#Part 1: One-Way ANOVA #Step 1:State null and alternative hypotheses
#Research question: Does bone mineral density (BMD) differ across various ethnic groups?
Null hypothesis (H₀): 𝐻0:𝜇1=𝜇2=𝜇3=𝜇4= 𝜇5. Mean BMD is equal across all the ethnic groups
Alternative hypothesis (H₁): At least one ethnic group’s mean BMD differs from the others
Bone mineral density (BMD) is a critical indicator of bone health and osteoporosis risk. Understanding racial and ethnic disparities in BMD, along with factors associated with bone health, is essential for identifying populations at higher risk of osteoporosis.Examining these disparities can inform targeted screening programs, particularly among vulnerable or underserved populations. Additionally, to implement better nutritional intervention strategies. This will also help to develop clinical guidelines for osteoporosis prevention. Ultimately, this research has the potential to promote health equity initiatives by addressing disparities in bone health outcomes and access to preventive care.
#Step 2: Exploratory Analysis
summary_stats <- data_clean %>%
group_by(RIDRETH1) %>%
summarise(
n = n(),
Mean = mean(DXXOFBMD, na.rm = TRUE),
SD = sd(DXXOFBMD, na.rm = TRUE),
Median = median(DXXOFBMD, na.rm = TRUE),
Min = min(DXXOFBMD, na.rm = TRUE),
Max = max(DXXOFBMD, na.rm = TRUE),
.groups = "drop"
)
summary_stats %>%
kable(digits = 3,
caption = "Descriptive statistics for BMD by ethnicity")
| RIDRETH1 | n | Mean | SD | Median | Min | Max |
|---|---|---|---|---|---|---|
| American, Hispanic | 255 | 0.950 | 0.149 | 0.955 | 0.442 | 1.456 |
| Other, Hispanic | 244 | 0.946 | 0.156 | 0.936 | 0.552 | 1.394 |
| White, Non-Hispanic | 846 | 0.888 | 0.160 | 0.881 | 0.370 | 1.456 |
| Black, Non-Hispanic | 532 | 0.973 | 0.161 | 0.982 | 0.439 | 1.550 |
| Other Race | 409 | 0.897 | 0.148 | 0.890 | 0.540 | 1.340 |
#Visual Exploration: Boxplot for BMD
ggplot(data_clean, aes(x = RIDRETH1, y = DXXOFBMD, fill = RIDRETH1)) +
geom_boxplot(alpha = 0.7) +
geom_jitter(width = 0.2, alpha = 0.3, size = 0.5) +
scale_fill_brewer(palette = "Set2") +
labs(
title = "Bone Mineral Density by Etnicityl",
subtitle = "NHANES Bone Mineral Density Data",
x = "Ethnicity",
y = "Bone Mineral Density (g/cm^2)",
fill = "RIDRETH1"
) +
theme_minimal(base_size = 12) +
theme(legend.position = "none")
#Step 3: ANOVA F-test
# Fit one-way ANOVA
anova_model <- aov(DXXOFBMD ~ RIDRETH1, data = data_clean)
summary(anova_model)
## Df Sum Sq Mean Sq F value Pr(>F)
## RIDRETH1 4 2.95 0.7371 30.18 <2e-16 ***
## Residuals 2281 55.70 0.0244
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Display ANOVA table
anova_table <- summary(anova_model)
print(anova_table)
## Df Sum Sq Mean Sq F value Pr(>F)
## RIDRETH1 4 2.95 0.7371 30.18 <2e-16 ***
## Residuals 2281 55.70 0.0244
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Create a formatted ANOVA table
anova_df <- data.frame(
Source = c("Activity Group", "Residuals"),
DF = c(anova_table[[1]]$Df[1], anova_table[[1]]$Df[2]),
`Sum Sq` = c(anova_table[[1]]$`Sum Sq`[1], anova_table[[1]]$`Sum Sq`[2]),
`Mean Sq` = c(anova_table[[1]]$`Mean Sq`[1], anova_table[[1]]$`Mean Sq`[2]),
`F value` = c(anova_table[[1]]$`F value`[1], NA),
`Pr(>F)` = c(anova_table[[1]]$`Pr(>F)`[1], NA)
)
kable(anova_df, digits = 4,
caption = "ANOVA table for BMD by Etnicity",
col.names = c("Source", "DF", "Sum Sq", "Mean Sq", "F value", "p-value"))
| Source | DF | Sum Sq | Mean Sq | F value | p-value |
|---|---|---|---|---|---|
| Activity Group | 4 | 2.9482 | 0.7371 | 30.185 | 0 |
| Residuals | 2281 | 55.6973 | 0.0244 | NA | NA |
#Interpretation (decision) F-statistic: 30.185 p-value: 2e-16 Conclusion: Since p < 0.05, we reject the null hypothesis. There is statistically significant evidence that mean BMD differs across at least one etnicity groups.
#Step 4: Assumption Checks, A: Normality
par(mfrow = c(2, 2), mar = c(4, 4, 2, 1))
plot(anova_model)
par(mfrow = c(1, 1))
#Intepretation The Residuals vs Fitted plot show random scatter with no visible pattern. Points are roughly equally spread above and below zero, indicating that the independence assumption holds and the relationship between ethnicity and BMD is appropriately linear for this model. The Q-Q plot (top-right), test for normality. Points does fall close to the diagonal line suggesting fair normality. The Scale-Location plot (bottom-left) tests the homogeneity of variance assumption. The red line is horizontal, and points form a clear upward or downward trend. Suggesting that variance increases or decreases across fitted values. The Residuals vs Leverage plot (bottom-right) identifies outliers and influential observations.Data points are inside the dashed Cook’s distance lines.
#B. Equal Variances: - Conduct Levene’s test
#Levene's test for homogeneity of variance
levene_test <- leveneTest(DXXOFBMD ~ RIDRETH1, data = data_clean)
print(levene_test)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 4 1.5728 0.1788
## 2281
#Levene’s Test Interpretation: p-value: 0.1788 If p > 0.05, Since p value is greater than we 0.05, This indicates that the variances are not significantly different across the groups, therefore the assumption of homogeneity of variance is satisfied.
#Step 5: Post-hoc Comparisons
# Conduct Tukey HSD test
tukey_results <- TukeyHSD(anova_model)
print(tukey_results)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = DXXOFBMD ~ RIDRETH1, data = data_clean)
##
## $RIDRETH1
## diff lwr upr
## Other, Hispanic-American, Hispanic -0.004441273 -0.042644130 0.03376158
## White, Non-Hispanic-American, Hispanic -0.062377249 -0.092852618 -0.03190188
## Black, Non-Hispanic-American, Hispanic 0.022391619 -0.010100052 0.05488329
## Other Race-American, Hispanic -0.053319344 -0.087357253 -0.01928144
## White, Non-Hispanic-Other, Hispanic -0.057935976 -0.088934694 -0.02693726
## Black, Non-Hispanic-Other, Hispanic 0.026832892 -0.006150151 0.05981593
## Other Race-Other, Hispanic -0.048878071 -0.083385341 -0.01437080
## Black, Non-Hispanic-White, Non-Hispanic 0.084768868 0.061164402 0.10837333
## Other Race-White, Non-Hispanic 0.009057905 -0.016633367 0.03474918
## Other Race-Black, Non-Hispanic -0.075710963 -0.103764519 -0.04765741
## p adj
## Other, Hispanic-American, Hispanic 0.9978049
## White, Non-Hispanic-American, Hispanic 0.0000003
## Black, Non-Hispanic-American, Hispanic 0.3276613
## Other Race-American, Hispanic 0.0001919
## White, Non-Hispanic-Other, Hispanic 0.0000036
## Black, Non-Hispanic-Other, Hispanic 0.1722466
## Other Race-Other, Hispanic 0.0010720
## Black, Non-Hispanic-White, Non-Hispanic 0.0000000
## Other Race-White, Non-Hispanic 0.8719109
## Other Race-Black, Non-Hispanic 0.0000000
# Visualize the confidence intervals
plot(tukey_results, las = 0)
#Interpretation of Tukey test Tukey test reveled that post-hoc tests indicated significant difference in BMD between some ethnic groups. White, Non-Hispanic has lower BMD than American, Hispanic (-0.0624),with a significance level of 0.0000003. Other Race has a lower BMD than Mexican American (-0.0533),with a significance level of 0.0001919. White, Non-Hispanic has a lower BMD than other, Hispanic (-0.058), with a significance level of 0.0000036. Other Race has a lower BMD than other Hispanic (-0.049), with a significance level of 0.0010720. Black, Non-Hispanic has a higher BMD than White, Non-Hispanic (0.085) with a significance level of 0.0000000. Other Race-Black has a lower BMD than Non-Hispanic (-0.076) with a significance level of 0.0000000. No significant differences were found between Other Hispanic and Mexican American, Black, Non-Hispanic and Mexican American, Black, Non-Hispanic and Other Hispanic, or Other Race and White, Non-Hispanic groups.
#Step 6: Effect Size
# Extract sum of squares from ANOVA table
anova_summary <- summary(anova_model)[[1]]
ss_treatment <- anova_summary$`Sum Sq`[1]
ss_total <- sum(anova_summary$`Sum Sq`)
# Calculate eta-squared
eta_squared <- ss_treatment / ss_total
cat("Eta-squared (η²):", round(eta_squared, 4), "\n")
## Eta-squared (η²): 0.0503
cat("Percentage of variance explained:", round(eta_squared * 100, 2), "%")
## Percentage of variance explained: 5.03 %
#Interpretation of effect size While the F-test indicates a statistically significant difference across ethnicity groups, the practical magnitude of this effect is 0.05, which is small. Ethnicity explains 5.03% of BMD variance.
#Part 2: Correlation Analysis
#Step 1: Create Total Intake Variables
#NA removed from DSQTCALC &DSQTVD
bmd_clean <- data_clean %>%
filter(!is.na(DSQTCALC) & !is.na(DSQTVD))
data_clean <- data_clean %>%
filter(!is.na(DSQTCALC) & !is.na(DSQTVD)) %>%
mutate(
calcium_total = calcium + DSQTCALC,
vitd_total = vitd + DSQTVD
)
summary(data_clean$calcium_total)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 160.5 850.9 1164.3 1256.1 1583.8 3945.0 54
plot(pressure)
#Step 2: Assess and Apply Transformations
#Create histograms for continuous variables (BMI, calcium, vitamin D, MET)
hist(data_clean$BMXBMI,
main = "Histogram of BMI",
xlab = "BMI",
col = "lightblue")
#The distribution of BMI sightly rightly skewed.
#indicating that most participants have lower to moderate intake levels,
#with a smaller number reporting very high intake values.
#This pattern suggests the presence of high-value outliers.
#Suggested transformation=Log transformation
hist(data_clean$calcium,
main = "Histogram of Total Calcium Intake",
xlab = "Calcium Intake",
col = "darkgreen")
#Intepretation: The distribution of Total Calcium Intake is sightly rightly skewed. indicating that most participants have lower to moderate intake levels, with a smaller number reporting very high intake values. This pattern suggests the presence of high-value outliers. Suggested transformation=Log transformation
#Step 2: Assess and Apply Transformations
hist(data_clean$vitd,
main = "Histogram of Total Vitamin D Intake",
xlab = "Vitamin D Intake",
col = "pink")
Intepretation: The distribution of Total Vitamin D Intake is rightly
skewed,indicating that most participants have lower to moderate intake
levels, with a smaller number reporting very high intake values. This
pattern suggests the presence of high-value outliers.Suggested
transformation=Square root transformation
#Step 2: Assess and Apply Transformations
hist(data_clean$totmet,
main = "Histogram of Physical activity",
xlab = "Physical activity",
col = "purple")
Intepretation: The distribution of Physical activity is rightly skewed.
This means that the most data points are towards the higher end of
tail.Suggested transformation=Square root transformation
#Step 2: Apply Transformations
data_clean <- data_clean %>%
mutate(
log_BMI = log(BMXBMI),
log_calcium = log(calcium),
sqrt_vitd = sqrt(vitd),
sqrt_MET = sqrt(totmet)
)
# Create the correlation helper function
corr_pair <- function(data, var1, var2) {
# Remove missing values
data_cor <- data[complete.cases(data[[var1]], data[[var2]]), ]
# Calculate correlation test
cor_test <- cor.test(data_cor[[var1]], data_cor[[var2]],
method = "pearson")
# Extract results
result <- data.frame(
Variable_1 = var1,
Variable_2 = var2,
r = round(cor_test$estimate, 3),
p_value = format.pval(cor_test$p.value, digits = 3),
n = nrow(data_cor)
)
return(result)
}
# Table A: Predictors vs Outcome (BMD)
table_a <- rbind(
corr_pair(data_clean, "RIDAGEYR", "DXXOFBMD"),
corr_pair(data_clean, "log_BMI", "DXXOFBMD"),
corr_pair(data_clean, "log_calcium", "DXXOFBMD"),
corr_pair(data_clean, "sqrt_vitd", "DXXOFBMD")
)
# Add descriptive labels
table_a <- table_a %>%
mutate(
Variable_1 = case_when(
Variable_1 == "RIDAGEYR" ~ "Age (years)",
Variable_1 == "log_BMI" ~ "BMI (log-transformed)",
Variable_1 == "log_calcium" ~ "Total Calcium (log-transformed)",
Variable_1 == "sqrt_vitd" ~ "Total Vitamin D (sqrt-transformed)",
TRUE ~ Variable_1
),
Variable_2 = "BMD (g/cm²)"
)
# Display formatted table
kable(table_a,
caption = "Table A: Correlations between Predictors vs BMD (Outcome)",
col.names = c("Predictor", "Outcome", "r", "p-value", "n"),
align = c('l', 'l', 'r', 'r', 'r'))
| Predictor | Outcome | r | p-value | n | |
|---|---|---|---|---|---|
| cor | Age (years) | BMD (g/cm²) | -0.179 | 2.58e-07 | 817 |
| cor1 | BMI (log-transformed) | BMD (g/cm²) | 0.440 | <2e-16 | 810 |
| cor2 | Total Calcium (log-transformed) | BMD (g/cm²) | 0.080 | 0.0272 | 763 |
| cor3 | Total Vitamin D (sqrt-transformed) | BMD (g/cm²) | 0.026 | 0.469 | 763 |
#Interpretation (Table A): BMI showed a moderate positive correlation with bone mineral density (BMD) (r=0.42, p value=2x10^(-16)), indicating that individuals with higher BMI tended to have higher BMD. In contrast, age demonstrated a weak negative correlation with BMD (r=-0.232, p value=2x10^(-16)), suggesting that BMD decreases slightly with increasing age. There was no statistically significant correlation observed between BMD and total calcium intake or total vitamin D intake.
#Table B: Predictors vs. Exposure (Physical Activity)
table_b <- rbind(
corr_pair(data_clean, "RIDAGEYR", "sqrt_MET"),
corr_pair(data_clean, "log_BMI", "sqrt_MET"),
corr_pair(data_clean, "log_calcium", "sqrt_MET"),
corr_pair(data_clean, "sqrt_vitd", "sqrt_MET")
)
# Add descriptive labels
table_b <- table_b %>%
mutate(
Variable_1 = case_when(
Variable_1 == "RIDAGEYR" ~ "Age (years)",
Variable_1 == "log_BMI" ~ "BMI (log-transformed)",
Variable_1 == "log_calcium" ~ "Total Calcium (log-transformed)",
Variable_1 == "sqrt_vitd" ~ "Total Vitamin D (sqrt-transformed)",
TRUE ~ Variable_1
),
Variable_2 = "Physical Activity (MET, sqrt-transformed)"
)
# Display formatted table
kable(table_b,
caption = "Table B: Correlations between Predictors vs Physical Activity (Exposure)",
col.names = c("Predictor", "Exposure", "r", "p-value", "n"),
align = c('l', 'l', 'r', 'r', 'r'))
| Predictor | Exposure | r | p-value | n | |
|---|---|---|---|---|---|
| cor | Age (years) | Physical Activity (MET, sqrt-transformed) | -0.064 | 0.119 | 597 |
| cor1 | BMI (log-transformed) | Physical Activity (MET, sqrt-transformed) | 0.055 | 0.178 | 592 |
| cor2 | Total Calcium (log-transformed) | Physical Activity (MET, sqrt-transformed) | 0.104 | 0.0134 | 561 |
| cor3 | Total Vitamin D (sqrt-transformed) | Physical Activity (MET, sqrt-transformed) | 0.074 | 0.0796 | 561 |
#Interpretation (Table B): Age shows a weak negative association with physical activity (r=-0.105, p value=2.67x 10^(-5)), indicating that physical activity levels tend to decrease slightly with increasing age. The relationship between calcium intake and physical activity was statistically significant but the coorelation is small (r = 0.093, p value=0.0003), suggesting minimum significance. Vitamin D and BMI intake was not significantly associated with physical activity levels (p value = 0.128 & 0.732 respectively).
#Table C: Predictors vs. Each Other (Intercorelation)
table_c <- rbind(
corr_pair(data_clean, "RIDAGEYR", "log_BMI"),
corr_pair(data_clean, "RIDAGEYR", "log_calcium"),
corr_pair(data_clean, "RIDAGEYR", "sqrt_vitd"),
corr_pair(data_clean, "log_BMI", "log_calcium"),
corr_pair(data_clean, "log_BMI", "sqrt_vitd"),
corr_pair(data_clean, "log_calcium", "sqrt_vitd")
)
# Add descriptive labels
table_c <- table_c %>%
mutate(
Variable_1 = case_when(
Variable_1 == "RIDAGEYR" ~ "Age (years)",
Variable_1 == "log_BMI" ~ "BMI (log-transformed)",
Variable_1 == "log_calcium" ~ "Total Calcium (log-transformed)",
Variable_1 == "sqrt_vitd" ~ "Total Vitamin D (sqrt-transformed)",
TRUE ~ Variable_1
),
Variable_2 = case_when(
Variable_2 == "RIDAGEYR" ~ "Age (years)",
Variable_2 == "log_BMI" ~ "BMI (log-transformed)",
Variable_2 == "log_calcium" ~ "Total Calcium (log-transformed)",
Variable_2 == "sqrt_vitd" ~ "Total Vitamin D (sqrt-transformed)",
TRUE ~ Variable_2
)
)
# Display formatted table
kable(table_c,
caption = "Table C: Intercorrelations Among Predictors",
col.names = c("Variable 1", "Variable 2", "r", "p-value", "n"),
align = c('l', 'l', 'r', 'r', 'r'))
| Variable 1 | Variable 2 | r | p-value | n | |
|---|---|---|---|---|---|
| cor | Age (years) | BMI (log-transformed) | -0.015 | 0.665 | 810 |
| cor1 | Age (years) | Total Calcium (log-transformed) | -0.051 | 0.16 | 763 |
| cor2 | Age (years) | Total Vitamin D (sqrt-transformed) | 0.026 | 0.468 | 763 |
| cor3 | BMI (log-transformed) | Total Calcium (log-transformed) | -0.009 | 0.794 | 759 |
| cor4 | BMI (log-transformed) | Total Vitamin D (sqrt-transformed) | -0.034 | 0.349 | 759 |
| cor5 | Total Calcium (log-transformed) | Total Vitamin D (sqrt-transformed) | 0.512 | <2e-16 | 763 |
#Intepretation-Age demonstrated a very weak negative correlation with log-transformed BMI (r = −0.09, p value=9.28x 10^(-06)), indicating that older individuals in this sample tended to have slightly lower BMI, although the magnitude of the relationship was minimal. Age was not significantly correlated with total calcium intake (r = −0.002, p = 0.921), suggesting no meaningful relationship between age and calcium consumption. A very weak positive correlation was observed between age and total vitamin D intake (r = 0.08, p value=0.00011), indicating slightly higher vitamin D intake among older participants, although the effect size was small.BMI showed no significant correlations with either calcium intake (r = 0.02, p = 0.357) or vitamin D intake (r = −0.01, p = 0.524), suggesting that body mass was not associated with vit D or calcium nutrient intake in this sample. Total calcium intake and total vitamin D intake demonstrated a moderate positive correlation (r = 0.55, p value=2x 10^(-16)), representing the strongest relationship among the predictors. This finding suggests that individuals with higher intake of one nutrient tended to have higher intake of the other, likely reflecting shared dietary behaviors or supplement use.
#Part 3: Reflection: When to use ANOVA vs. correlation ANOVA (Analysis of Variance) was used to compare means of a continuous variable (BMD)across multiple groups defined by a categorical variable (race/ethnicity).This test determines whether at least one group mean differs significantly from the others.In this analysis, ANOVA was applied to answer the research question: Does mean BMD differ by race/ethnicity?”
#Correlation Correlation analysis measures the strength and direction of the linear relationship between two continuous variables. It does not imply causation, only association.In this analysis, Pearson correlation was used to answer the research question:“Is BMI correlated with total calcium intake among adults?”
#B. Assumptions & Limitations
Several assumptions underlying Pearson correlation were potentially challenging to fully satisfy. First, normality of variables, particularly dietary intake variables (calcium and vitamin D) and physical activity, was difficult because these measures were right-skewed and required transformation. Second, linearity between predictors and bone mineral density (BMD) may not be perfect, as biological relationships are sometimes nonlinear across age ranges.
Violations of these assumptions can influence both the magnitude and statistical significance of correlations. Non-normal distributions and outliers can inflate or attenuate correlation coefficients, potentially leading to misleading interpretations of strength. Nonlinearity may cause Pearson correlations to underestimate true relationships if associations are curved rather than linear.
Because the data are cross-sectional, correlations cannot establish causality or temporal direction. Observed relationships represent associations at a single point in time and may be influenced by confounding variables, reverse causation, or shared underlying factors. For example, higher BMI may be associated with higher BMD, but this does not imply that increasing BMI causes improved bone density. Therefore, causal conclusions cannot be drawn, and longitudinal or experimental studies would be required to establish causal relationships.
#C. R Programming Challenge The assignment is broken into multiple sections, making it easy to follow and complete each step sequentially. For me, the correlation part was the most challenging, particularly because log transformation of data has not yet been covered in our course, yet. Additionally, ensuring that missing values were properly handled before calculations required careful attention.