#Epi553 Homework 1 : ANOVA and Correlation analysis

Part 0: Setup and Data Preparation

##Step1: Loading packages and importing csv file

# Load necessary libraries
library(tidyverse)   # For data manipulation and visualization
library(knitr)       # For nice tables
library(car)         # For Levene's test
library(NHANES)      # NHANES dataset

##Step2: Importing BMD.csv file

# Import the dataset
setwd("C:/Users/AA843241/OneDrive - University at Albany - SUNY/Desktop/EPI553")
bmd = read.csv('bmd.csv', header = T)     
names (bmd)

##  [1] "SEQN"     "RIAGENDR" "RIDAGEYR" "RIDRETH1" "BMXBMI"   "smoker"  
##  [7] "totmet"   "metcat"   "DXXOFBMD" "tbmdcat"  "calcium"  "vitd"    
## [13] "DSQTVD"   "DSQTCALC"

head (bmd, n=10)

##     SEQN RIAGENDR RIDAGEYR RIDRETH1 BMXBMI smoker totmet metcat DXXOFBMD
## 1  93705        2       66        4   31.7      2    240      0    1.058
## 2  93708        2       66        5   23.7      3    120      0    0.801
## 3  93709        2       75        4   38.9      1    720      1    0.880
## 4  93711        1       56        5   21.3      3    840      1    0.851
## 5  93713        1       67        3   23.5      1    360      0    0.778
## 6  93714        2       54        4   39.9      2     NA     NA    0.994
## 7  93715        1       71        5   22.5      1   6320      2    0.952
## 8  93716        1       61        5   30.7      2   2400      2    1.121
## 9  93721        2       60        1   35.9      3     NA     NA       NA
## 10 93722        2       60        3   23.8      3     NA     NA    0.752
##    tbmdcat calcium vitd  DSQTVD DSQTCALC
## 1        0   503.5 1.85  20.557   211.67
## 2        1   473.5 5.85  25.000   820.00
## 3        0      NA   NA      NA       NA
## 4        1  1248.5 3.85  25.000    35.00
## 5        1   660.5 2.35      NA    13.33
## 6        0   776.0 5.65      NA       NA
## 7        0   452.0 3.75      NA    26.67
## 8        0   853.5 4.45 100.000  1066.67
## 9       NA   929.0 6.05  50.000    35.00
## 10       1  1183.5 6.45  46.667   133.33

##Step 3-5: Converting variables to factors

CONVERTING variables to factors

# Set seed for reproducibility
set.seed(553)

# Creating variable categories and prepare data
bmd_data <- bmd %>%
  filter(RIDAGEYR >= 18, RIDAGEYR <= 80) %>%  # Adults 18-80
  mutate(
    ethnicity_category = case_when(
      RIDRETH1 == 1 ~ "Mexican American",
      RIDRETH1 == 2 ~ "Other Hispanic",
      RIDRETH1 == 3 ~ "Non-Hispanic White",
      RIDRETH1 == 4 ~ "Non-Hispanic Black",
      RIDRETH1 == 5 ~ "Other" ),          #created ethnicity_category
    gender_category = case_when(
      RIAGENDR == 1 ~ "Male",
      RIAGENDR == 2 ~ "Female" ),        #Created gender_category
    smoker_category = case_when(
      smoker == 1 ~ "Current",
      smoker == 2 ~ "Past",
      smoker == 3 ~ "Never" ),          #created smoker_category
    
  # convert to factors 
   ethnicity_category = factor(ethnicity_category, 
                               levels = c("Mexican American",
                                        "Other Hispanic", "Non-Hispanic White",
                                        "Non-Hispanic Black", "Other")),  
  gender_category = factor(gender_category,
                             levels=c("Male", "Female")),
  smoker_category = factor(smoker_category,
                             levels=c("Current", "Past", "Never"))
  ) %>%
    
  select(DXXOFBMD, RIDAGEYR,RIDRETH1, ethnicity_category, RIAGENDR, gender_category, RIDAGEYR, smoker, smoker_category, BMXBMI,totmet, metcat, tbmdcat, calcium, vitd, DSQTVD, DSQTCALC)
  
# Show first 6 rows
head (bmd_data) %>%
  kable(digits=2, caption = "mutated BMD Dataset (first 6 rows)")

mutated BMD Dataset (first 6 rows)
DXXOFBMD	RIDAGEYR	RIDRETH1	ethnicity_category	RIAGENDR	gender_category	smoker	smoker_category	BMXBMI	totmet	metcat	tbmdcat	calcium	vitd	DSQTVD	DSQTCALC
1.06	66	4	Non-Hispanic Black	2	Female	2	Past	31.7	240	0	0	503.5	1.85	20.56	211.67
0.80	66	5	Other	2	Female	3	Never	23.7	120	0	1	473.5	5.85	25.00	820.00
0.88	75	4	Non-Hispanic Black	2	Female	1	Current	38.9	720	1	0	NA	NA	NA	NA
0.85	56	5	Other	1	Male	3	Never	21.3	840	1	1	1248.5	3.85	25.00	35.00
0.78	67	3	Non-Hispanic White	1	Male	1	Current	23.5	360	0	1	660.5	2.35	NA	13.33
0.99	54	4	Non-Hispanic Black	2	Female	2	Past	39.9	NA	NA	0	776.0	5.65	NA	NA

# Check sample sizes
table(bmd_data$ethnicity_category)

## 
##   Mexican American     Other Hispanic Non-Hispanic White Non-Hispanic Black 
##                331                286               1086                696 
##              Other 
##                499

table(bmd_data$gender_category)

## 
##   Male Female 
##   1431   1467

table(bmd_data$smoker_category)

## 
## Current    Past   Never 
##     449     894    1553

##Step 6-7: Reporting original sample size and removing missing values

# YOUR CODE HERE
bmd_data <- bmd_data %>%
  filter(RIDAGEYR >= 18, RIDAGEYR <= 80) %>%
  select(DXXOFBMD, RIDAGEYR,RIDRETH1, ethnicity_category, RIAGENDR, gender_category, RIDAGEYR, smoker, smoker_category, BMXBMI,totmet, metcat, tbmdcat, calcium, vitd, DSQTVD, DSQTCALC)

#Display sample size
data.frame(
  Metric = "Sample Size",
  Value = paste(nrow(bmd_data), "adults")
  ) %>%
kable()

Metric	Value
Sample Size	2898 adults

#Exploring the missing values in the dataset
sum(is.na(bmd_data)) # Total missing values

## [1] 6954

colSums(is.na(bmd_data)) # Missing values per column

##           DXXOFBMD           RIDAGEYR           RIDRETH1 ethnicity_category 
##                612                  0                  0                  0 
##           RIAGENDR    gender_category             smoker    smoker_category 
##                  0                  0                  2                  2 
##             BMXBMI             totmet             metcat            tbmdcat 
##                 64                964                964                612 
##            calcium               vitd             DSQTVD           DSQTCALC 
##                293                293               1515               1633

#Dropping missing values from column
library(tidyr)
# Remove rows where 'DXXOFBMD' or 'ethnicity_category' has NA
bmd_cleaned_data <- bmd_data %>% drop_na(DXXOFBMD, ethnicity_category)

##Step 8: Reporting final analytic sample

#Checking the missing values in the cleaned dataset
sum(is.na(bmd_cleaned_data)) # Total missing values

## [1] 4186

colSums(is.na(bmd_cleaned_data)) # Missing values per column

##           DXXOFBMD           RIDAGEYR           RIDRETH1 ethnicity_category 
##                  0                  0                  0                  0 
##           RIAGENDR    gender_category             smoker    smoker_category 
##                  0                  0                  0                  0 
##             BMXBMI             totmet             metcat            tbmdcat 
##                 11                698                698                  0 
##            calcium               vitd             DSQTVD           DSQTCALC 
##                157                157               1186               1279

#Display cleaned sample size
data.frame(
  Metric = "Sample Size",
  Value = paste(nrow(bmd_cleaned_data), "adults")
  ) %>%
kable()

Metric	Value
Sample Size	2286 adults

Final cleaned sample size is 2286 adults with 3 new variables ethnicity_category, gender_category, smoker_category

#Part 1: One-Way ANOVA

Research Question: Do mean BMD values differ across ethnicity groups

##Step 1: Null Hypothesis (H₀): μ1 = μ2 = μ3 = μ4 = μ5

plain language: Bone mineral density is equal across all five ethnicity categories

Alternative hypothesis(H1): At least one population means differs from the others Significance level: α = 0.05

##Step2:Exploratory analysis

Create table: n, mean, SD of BMD by ethnicity Plot with jittered points showing BMD distributions

# Calculate summary statistics by BMI category
summary_stats <- bmd_cleaned_data %>%
  group_by(ethnicity_category) %>%
  summarise(
    n = n(),
    Mean = mean(DXXOFBMD),
    SD = sd(DXXOFBMD),
    Median = median(DXXOFBMD),
    Min = min(DXXOFBMD),
    Max = max(DXXOFBMD)
  )

summary_stats %>% 
  kable(digits = 2, 
        caption = "Descriptive Statistics: Bone mineral density by ethnicity/race")

Descriptive Statistics: Bone mineral density by ethnicity/race
ethnicity_category	n	Mean	SD	Median	Min	Max
Mexican American	255	0.95	0.15	0.96	0.44	1.46
Other Hispanic	244	0.95	0.16	0.94	0.55	1.39
Non-Hispanic White	846	0.89	0.16	0.88	0.37	1.46
Non-Hispanic Black	532	0.97	0.16	0.98	0.44	1.55
Other	409	0.90	0.15	0.89	0.54	1.34

Observation: The mean BMD appears to slightly lower among Non-Hispanic white(0.9) and Other ethnicity (0.9)

# Create boxplots with individual points
ggplot(bmd_cleaned_data, 
  aes(x = ethnicity_category, y = DXXOFBMD, fill = ethnicity_category)) +
  geom_boxplot(alpha = 0.7, outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.1, size = 0.5) +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "BMD by ethnicity Category",
    subtitle = "BMD Dataset, Adults aged 18-65",
    x = "ethnicity Category",
    y = "BMD (g/cm²)",
    fill = "ethnicity Category"
  ) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none")

What the plot tells us:

##Step3: ANOVA F-test

# Fit the one-way ANOVA model
anova_model <- aov(DXXOFBMD ~ ethnicity_category, data = bmd_cleaned_data)

# Display the ANOVA table
summary(anova_model)

##                      Df Sum Sq Mean Sq F value Pr(>F)    
## ethnicity_category    4   2.95  0.7371   30.18 <2e-16 ***
## Residuals          2281  55.70  0.0244                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Interpretation:

F-statistic: 30.19
Degrees of freedom: df₁ = 4 (k-1 groups), df₂ = 2281 (n-k)
p-value: < 2e-16 (very small)
Decision: Since p < 0.05, we reject H₀
Conclusion: There is statistically significant evidence that mean BMD differs across at least 2 ethnicity categories.

##Step4: Assumptions check

A: Normality: Residuals are approximately normally distributed B: Homogeneity of variance: Almost equal variances across groups

#Creating Q-Q plot
par(mfrow = c(2, 2), mar = c(2, 2, 1, 1))
plot(anova_model)

#Levene's test for homogeneity of variance
levene_test <- leveneTest(DXXOFBMD ~ ethnicity_category, data = bmd_cleaned_data)
print(levene_test)

## Levene's Test for Homogeneity of Variance (center = median)
##         Df F value Pr(>F)
## group    4  1.5728 0.1788
##       2281

Diagnostic Plot Interpretation: Diagnostic plots suggest that assumptions of linearity, normality, and homoscedasticity are reasonably met. The Q-Q plot shows residuals approximately following a normal distribution. The Scale-Location plot does not indicate substantial heteroscedasticity. A few observations show slightly higher leverage, but no strongly influential points are evident.

Levene’s Test Interpretation: - p-value: 0.179 - If p < 0.05, we would reject equal variances - Here: Equal variance assumption is met

Overall Assessment: With n > 2000, ANOVA is robust to minor violations. Our assumptions are reasonably satisfied.

##Step5: Post-Hoc comparisons

# Conduct Tukey HSD test
tukey_results <- TukeyHSD(anova_model)
print(tukey_results)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = DXXOFBMD ~ ethnicity_category, data = bmd_cleaned_data)
## 
## $ethnicity_category
##                                               diff          lwr         upr
## Other Hispanic-Mexican American       -0.004441273 -0.042644130  0.03376158
## Non-Hispanic White-Mexican American   -0.062377249 -0.092852618 -0.03190188
## Non-Hispanic Black-Mexican American    0.022391619 -0.010100052  0.05488329
## Other-Mexican American                -0.053319344 -0.087357253 -0.01928144
## Non-Hispanic White-Other Hispanic     -0.057935976 -0.088934694 -0.02693726
## Non-Hispanic Black-Other Hispanic      0.026832892 -0.006150151  0.05981593
## Other-Other Hispanic                  -0.048878071 -0.083385341 -0.01437080
## Non-Hispanic Black-Non-Hispanic White  0.084768868  0.061164402  0.10837333
## Other-Non-Hispanic White               0.009057905 -0.016633367  0.03474918
## Other-Non-Hispanic Black              -0.075710963 -0.103764519 -0.04765741
##                                           p adj
## Other Hispanic-Mexican American       0.9978049
## Non-Hispanic White-Mexican American   0.0000003
## Non-Hispanic Black-Mexican American   0.3276613
## Other-Mexican American                0.0001919
## Non-Hispanic White-Other Hispanic     0.0000036
## Non-Hispanic Black-Other Hispanic     0.1722466
## Other-Other Hispanic                  0.0010720
## Non-Hispanic Black-Non-Hispanic White 0.0000000
## Other-Non-Hispanic White              0.8719109
## Other-Non-Hispanic Black              0.0000000

# Visualize the confidence intervals
par(mar = c(5, 16, 4, 2))
plot(tukey_results, las=1)

Interpretation: Statistically significant differences in BMD are among: Non-Hispanic White to Mexican American, Other to Mexican American, Non-Hispanic White to Other Hispanic, Other to Other Hispanic, Non-Hispanic Black to Non-Hispanic White and Other to Non-Hispanic Black because their family-wise Confidence Intervals do not cross the Null value and p <0.05

##Step6: Effect Size

# Extract sum of squares from ANOVA table
anova_summary <- summary(anova_model)[[1]]

ss_treatment <- anova_summary$`Sum Sq`[1]
ss_total <- sum(anova_summary$`Sum Sq`)

# Calculate eta-squared
eta_squared <- ss_treatment / ss_total

cat("Eta-squared (η²):", round(eta_squared, 4), "\n")

## Eta-squared (η²): 0.0503

cat("Percentage of variance explained:", round(eta_squared * 100, 2), "%")

## Percentage of variance explained: 5.03 %

Interpretation: ethnicity category explains 5.03% of the variance in Bone Mineral Density.

Effect size guidelines: Small (0.01), Medium (0.06), Large (0.14)
Our effect: 0.05 While statistically significant, the practical effect is small— ethnicity category alone doesn’t explain most of the variation in Bone mineral density.

#Part2: Correlation analysis:

Research Question: Is there a correlation between ethnicity and BMD among US adults?

##Step1: Create total intake variables:

# Set seed for reproducibility
set.seed(553)

# Creating variable categories and prepare data
bmd_cor_data <- bmd_cleaned_data %>%
  filter(RIDAGEYR >= 18, RIDAGEYR <= 80) %>%  # Adults 18-80
  mutate( 
    calcium_total = (coalesce(calcium,0) + coalesce(DSQTCALC,0)),
        vitd_total = (coalesce(vitd,0) + coalesce(DSQTVD,0))
          )%>%
  select(DXXOFBMD, calcium, vitd, DSQTVD, DSQTCALC, calcium_total, vitd_total,  RIDAGEYR,RIDRETH1, ethnicity_category, RIAGENDR, gender_category, smoker, smoker_category, BMXBMI,totmet, metcat, tbmdcat)
  
# Show first 7 rows
head (bmd_cor_data) %>%
  kable(digits=2, caption = "BMD Dataset with created total intake variables  (first 7 rows)")

BMD Dataset with created total intake variables (first 7 rows)
DXXOFBMD	calcium	vitd	DSQTVD	DSQTCALC	calcium_total	vitd_total	RIDAGEYR	RIDRETH1	ethnicity_category	RIAGENDR	gender_category	smoker	smoker_category	BMXBMI	totmet	metcat	tbmdcat
1.06	503.5	1.85	20.56	211.67	715.17	22.41	66	4	Non-Hispanic Black	2	Female	2	Past	31.7	240	0	0
0.80	473.5	5.85	25.00	820.00	1293.50	30.85	66	5	Other	2	Female	3	Never	23.7	120	0	1
0.88	NA	NA	NA	NA	0.00	0.00	75	4	Non-Hispanic Black	2	Female	1	Current	38.9	720	1	0
0.85	1248.5	3.85	25.00	35.00	1283.50	28.85	56	5	Other	1	Male	3	Never	21.3	840	1	1
0.78	660.5	2.35	NA	13.33	673.83	2.35	67	3	Non-Hispanic White	1	Male	1	Current	23.5	360	0	1
0.99	776.0	5.65	NA	NA	776.00	5.65	54	4	Non-Hispanic Black	2	Female	2	Past	39.9	NA	NA	0

#Exploring the missing values in the dataset
sum(is.na(bmd_cor_data)) # Total missing values

## [1] 4186

colSums(is.na(bmd_cor_data)) # Missing values per column

##           DXXOFBMD            calcium               vitd             DSQTVD 
##                  0                157                157               1186 
##           DSQTCALC      calcium_total         vitd_total           RIDAGEYR 
##               1279                  0                  0                  0 
##           RIDRETH1 ethnicity_category           RIAGENDR    gender_category 
##                  0                  0                  0                  0 
##             smoker    smoker_category             BMXBMI             totmet 
##                  0                  0                 11                698 
##             metcat            tbmdcat 
##                698                  0

We can see that there is no missing values in calcium_total and vitd_total variables

##Step2: Assess and Apply transformations

###Create histograms for BMI, calcium, Vitamin D, MET

set.seed(553)
# Calculate measures
mean_BMI <- mean(bmd_cor_data$BMXBMI, na.rm = TRUE)
median_BMI <- median(bmd_cor_data$BMXBMI, na.rm = TRUE) 

# Visualize
  ggplot(bmd_cor_data, aes(x = BMXBMI)) +
  geom_histogram(bins = 30, fill = "steelblue", alpha = 0.7) +
  geom_vline(aes(xintercept = mean_BMI, color = "Mean"), 
             linewidth = 1.2, linetype = "dashed") +
  geom_vline(aes(xintercept = median_BMI, color = "Median"), 
             linewidth = 1.2, linetype = "dashed") +
  scale_color_manual(values = c("Mean" = "red", "Median" = "blue")) +
  labs(
    title = "BMI Distribution (kg/m²)",
    x = "BMI", y = "Frequency",
    color = "Measure"
  ) +
  theme_minimal()

# Print values
cat("Mean:", round(mean_BMI, 2), "kg/m²")

## Mean: 29 kg/m²

cat("Median:", round(median_BMI, 2), "kg/m²")

## Median: 28.2 kg/m²

set.seed(553)

# Calculate measures
mean_calcium_total <- mean(bmd_cor_data$calcium_total, na.rm = TRUE)
median_calcium_total <- median(bmd_cor_data$calcium_total, na.rm = TRUE) 

# Visualize
  ggplot(bmd_cor_data, aes(x = calcium_total)) +
  geom_histogram(bins = 30, fill = "steelblue", alpha = 0.7) +
  geom_vline(aes(xintercept = mean_calcium_total, color = "Mean"), 
             linewidth = 1.2, linetype = "dashed") +
  geom_vline(aes(xintercept = median_calcium_total, color = "Median"), 
             linewidth = 1.2, linetype = "dashed") +
  scale_color_manual(values = c("Mean" = "red", "Median" = "blue")) +
  labs(
    title = "Total Calcium intake Distribution (mg/day)",
    x = "Total Calcium intake", y = "Frequency",
    color = "Measure"
  ) +
  theme_minimal()

# Print values
cat("Mean:", round(mean_calcium_total, 2), "mg/day")

## Mean: 943.83 mg/day

cat("Median:", round(median_calcium_total, 2), "mg/day²")

## Median: 865.25 mg/day²

set.seed(553)

# Calculate measures
mean_vitd_total <- mean(bmd_cor_data$vitd_total, na.rm = TRUE)
median_vitd_total <- median(bmd_cor_data$vitd_total, na.rm = TRUE) 

# Visualize
  ggplot(bmd_cor_data, aes(x = vitd_total)) +
  geom_histogram(bins = 30, fill = "steelblue", alpha = 0.7) +
  geom_vline(aes(xintercept = mean_vitd_total, color = "Mean"), 
             linewidth = 1.2, linetype = "dashed") +
  geom_vline(aes(xintercept = median_vitd_total, color = "Median"), 
             linewidth = 1.2, linetype = "dashed") +
  scale_color_manual(values = c("Mean" = "red", "Median" = "blue")) +
  labs(
    title = "Total Vitamin D intake Distribution (mcg/day)",
    x = "Total Vitamin D intake", y = "Frequency",
    color = "Measure"
  ) +
  theme_minimal()

# Print values
cat("Mean:", round(mean_vitd_total, 2), "mcg/day")

## Mean: 28.81 mcg/day

cat("Median:", round(median_vitd_total, 2), "mcg/day")

## Median: 8.34 mcg/day

set.seed(553)

# Calculate measures
mean_totmet <- mean(bmd_cor_data$totmet, na.rm = TRUE)
median_totmet <- median(bmd_cor_data$totmet, na.rm = TRUE) 

# Visualize
  ggplot(bmd_cor_data, aes(x = totmet)) +
  geom_histogram(bins = 30, fill = "steelblue", alpha = 0.7) +
  geom_vline(aes(xintercept = mean_totmet, color = "Mean"), 
             linewidth = 1.2, linetype = "dashed") +
  geom_vline(aes(xintercept = median_totmet, color = "Median"), 
             linewidth = 1.2, linetype = "dashed") +
  scale_color_manual(values = c("Mean" = "red", "Median" = "blue")) +
  labs(
    title = "Physical activity Distribution (MET-min/week)",
    x = "Physical activity", y = "Frequency",
    color = "Measure"
  ) +
  theme_minimal()

# Print values
cat("Mean:", round(mean_totmet, 2), "MET-min/week")

## Mean: 1026.47 MET-min/week

cat("Median:", round(median_totmet, 2), "MET-min/week")

## Median: 480 MET-min/week

Decide on trnasformations based on skewness

BMI - almost simmetrical, slightly skewed to the right, means approx. equal to median Totmet - skewed to the right - mean>median Total Calcium intake - skewed to the right - mean>median Total vitamin D intake - skewed to the right - mean>median

Therefore following suggested transformations - BMI, calcium - log, and MET, vitd - Square root

bmd_trans_cor <- bmd_cor_data %>%
  mutate(
    BMI_log = log(BMXBMI + 0.01),         
    calcium_log = log(calcium_total + 0.01),
    MET_sqrt = sqrt(totmet),
    vitd_total_sqrt = sqrt(vitd_total )
        )

set.seed(553)
# Calculate measures
mean_calcium_log <- mean(bmd_trans_cor$calcium_log, na.rm = TRUE)
median_calcium_log <- median(bmd_trans_cor$calcium_log, na.rm = TRUE) 

# Visualize
  ggplot(bmd_trans_cor, aes(x = calcium_log)) +
  geom_histogram(bins = 30, fill = "steelblue", alpha = 0.7) +
  geom_vline(aes(xintercept = mean_calcium_log, color = "Mean"), 
             linewidth = 1.2, linetype = "dashed") +
  geom_vline(aes(xintercept = median_calcium_log, color = "Median"), 
             linewidth = 1.2, linetype = "dashed") +
  scale_color_manual(values = c("Mean" = "red", "Median" = "blue")) +
  labs(
    title = " Transformed Calcium intake Distribution (mg/day)",
    x = "calcium_log", y = "Frequency",
    color = "Measure"
  ) +
  theme_minimal()

# Print values
cat("Mean:", round(mean_calcium_log, 2), "mg/day")

## Mean: 6.25 mg/day

cat("Median:", round(median_calcium_log, 2), "mg/day")

## Median: 6.76 mg/day

set.seed(553)

# Calculate measures
mean_MET_sqrt <- mean(bmd_trans_cor$MET_sqrt, na.rm = TRUE)
median_MET_sqrt <- median(bmd_trans_cor$MET_sqrt, na.rm = TRUE) 

# Visualize
  ggplot(bmd_trans_cor, aes(x = MET_sqrt)) +
  geom_histogram(bins = 30, fill = "steelblue", alpha = 0.7) +
  geom_vline(aes(xintercept = mean_MET_sqrt, color = "Mean"), 
             linewidth = 1.2, linetype = "dashed") +
  geom_vline(aes(xintercept = median_MET_sqrt, color = "Median"), 
             linewidth = 1.2, linetype = "dashed") +
  scale_color_manual(values = c("Mean" = "red", "Median" = "blue")) +
  labs(
    title = "Transformed Physcial activity Distribution (MET-min/Week)",
    x = "Met_sqrt", y = "Frequency",
    color = "Measure"
  ) +
  theme_minimal()

# Print values
cat("Mean:", round(mean_MET_sqrt, 2), "mg/day")

## Mean: 27.38 mg/day

cat("Median:", round(median_MET_sqrt, 2), "mg/day")

## Median: 21.91 mg/day

set.seed(553)

# Calculate measures
mean_vitd_total_sqrt <- mean(bmd_trans_cor$vitd_total_sqrt, na.rm = TRUE)
median_vitd_total_sqrt <- median(bmd_trans_cor$vitd_total_sqrt, na.rm = TRUE) 

# Visualize
  ggplot(bmd_trans_cor, aes(x =vitd_total_sqrt)) +
  geom_histogram(bins = 30, fill = "steelblue", alpha = 0.7) +
  geom_vline(aes(xintercept = mean_vitd_total_sqrt, color = "Mean"), 
             linewidth = 1.2, linetype = "dashed") +
  geom_vline(aes(xintercept = median_vitd_total_sqrt, color = "Median"), 
             linewidth = 1.2, linetype = "dashed") +
  scale_color_manual(values = c("Mean" = "red", "Median" = "blue")) +
  labs(
    title = "Transformed Vitamin D intake Distribution (mcg/day)",
    x = "vitd_total_sqrt", y = "Frequency",
    color = "Measure"
  ) +
  theme_minimal()

# Print values
cat("Mean:", round(mean_vitd_total_sqrt, 2), "mcg/day")

## Mean: 4 mcg/day

cat("Median:", round(median_vitd_total_sqrt, 2), "mcg/day")

## Median: 2.89 mcg/day

#Document reasoning: BMI - almost symmetrical, slightly skewed to the right, means approx. equal to median Totmet - skewed to the right - mean>median Total Calcium intake - skewed to the right - mean>median Total vitamin D intake - skewed to the right - mean>median

##Step3: Three correlation Tables

corr_pair <- function(x, y) {
  test <- cor.test(x, y, method = "pearson")
  data.frame(
    r = as.numeric(test$estimate),
    p_value = test$p.value
  )
}
#list of predictors
predictors <- c("RIDAGEYR", "BMI_log", "calcium_log", "vitd_total_sqrt")

# Table A: Predictors vs Outcome (BMD)
table_A <- map_dfr(predictors, function(var) {
  corr_pair(bmd_trans_cor[[var]], bmd_trans_cor$DXXOFBMD) %>%
    mutate(var1 = var,
           var2 = "DXXOFBMD")
})

cat("\nTable A: Predictors vs Outcome (BMD)\n")

## 
## Table A: Predictors vs Outcome (BMD)

kable(table_A, digits = 3, caption = "Table A: Predictors vs Outcome (BMD)")

Table A: Predictors vs Outcome (BMD)
r	p_value	var1	var2
-0.232	0.000	RIDAGEYR	DXXOFBMD
0.425	0.000	BMI_log	DXXOFBMD
0.029	0.162	calcium_log	DXXOFBMD
-0.051	0.015	vitd_total_sqrt	DXXOFBMD

# ️Table B: Predictors vs Exposure (Physical Activity)
table_B <- map_dfr(predictors, function(var) {
  corr_pair(bmd_trans_cor[[var]], bmd_trans_cor$MET_sqrt) %>%
    mutate(var1 = var,
           var2 = "MET_sqrt")
})

cat("\nTable B: Predictors vs Exposure (Physical Activity)\n")

## 
## Table B: Predictors vs Exposure (Physical Activity)

kable(table_B, digits = 3, caption = "Table B: Predictors vs Exposure (Physical Activity)")

Table B: Predictors vs Exposure (Physical Activity)
r	p_value	var1	var2
-0.105	0.000	RIDAGEYR	MET_sqrt
0.009	0.732	BMI_log	MET_sqrt
0.062	0.014	calcium_log	MET_sqrt
-0.002	0.934	vitd_total_sqrt	MET_sqrt

# Table C: Predictors vs Each Other 
pairs <- combn(predictors, 2, simplify = FALSE)

table_C <- map_dfr(pairs, function(x) {
  corr_pair(bmd_trans_cor[[x[1]]], bmd_trans_cor[[x[2]]]) %>%
    mutate(var1 = x[1],
           var2 = x[2])
})

cat("\nTable C: Predictors vs Each Other\n")

## 
## Table C: Predictors vs Each Other

kable(table_C, digits = 3, caption = "Table C: Predictors vs Each Other")

Table C: Predictors vs Each Other
r	p_value	var1	var2
-0.093	0.000	RIDAGEYR	BMI_log
-0.032	0.124	RIDAGEYR	calcium_log
0.146	0.000	RIDAGEYR	vitd_total_sqrt
0.038	0.071	BMI_log	calcium_log
0.024	0.250	BMI_log	vitd_total_sqrt
0.191	0.000	calcium_log	vitd_total_sqrt

# Q-Q plots for normality
par(mfrow = c(1, 2))

qqnorm(bmd_trans_cor$RIDAGEYR, main = "Q-Q Plot: Age")
qqline(bmd_trans_cor$RIDAGEYR, col = "red")

qqnorm(bmd_trans_cor$DXXOFBMD, main = "Q-Q Plot: BMD")
qqline(bmd_trans_cor$DXXOFBMD, col = "red")

qqnorm(bmd_trans_cor$BMI_log, main = "Q-Q Plot: BMI")
qqline(bmd_trans_cor$BMI_log, col = "red")


qqnorm(bmd_trans_cor$calcium_log, main = "Q-Q Plot: Calcium")
qqline(bmd_trans_cor$calcium_log, col = "red")

qqnorm(bmd_trans_cor$vitd_total_sqrt, main = "Q-Q Plot: VitD")
qqline(bmd_trans_cor$vitd_total_sqrt, col = "red")

qqnorm(bmd_trans_cor$MET_sqrt, main = "Q-Q Plot: Phys Activity")
qqline(bmd_trans_cor$MET_sqrt, col = "red")

par(mfrow = c(1, 1))

###Interpretations: 1)In table A we see statistically significant: - negative weak correlation between Age and Bone Mineral density, in other words there is weak indirect association between persons age and BMD - positive moderate correlation between BMI-log and Bone Mineral density, in other words there is moderate direct association between persons BMI and BMD -Negative very weak correlation between Vitamin D intake and Bone Mineral Density 2) In table B we see statistically significant: - Negative weak correlation between AGE and Physical Activity in met-min per week, in other words there is a weak association between person being older and persons’ physical activity
- Very weak positive correlation between total Calcium intake and Physical Activity in met-min per week. Other associations between predictors and physical activity are not statistically significant. 3)In table C we see statistically significant: - negative weak correlation between AGE and BMI, in other words there is weak association between person being older and person’s BMI being lower - positive very weak correlation between AGE and total vitamin D intake -positive very weak correlation between Total calcium D intake and total vitamin D intake

###Conclusion: Although Q-Q plots shows some deviation from normality for some variables, Person’s Corellation was used because of the large sample size(>2000), which makes the method relatively robust to moderate departures from normality. Adiitionally we could check with Person’s correlation

corr_pair_spearman <- function(x, y) {
  test <- cor.test(x, y, method = "spearman", exact = FALSE)
  
  data.frame(
    rho = as.numeric(test$estimate),
    p_value = test$p.value
  )
}
#list of predictors
predictors <- c("RIDAGEYR", "BMI_log", "calcium_log", "vitd_total_sqrt")

# Table A: Predictors vs Outcome (BMD)
table_A_spear <- map_dfr(predictors, function(var) {
  corr_pair_spearman(
    bmd_trans_cor[[var]],
    bmd_trans_cor$DXXOFBMD
  ) %>%
    mutate(
      predictor = var,
      outcome = "DXXOFBMD"
    )
})

kable(table_A_spear, digits = 3,
      caption = "Spearman Correlation: Predictors vs BMD Outcome")

Spearman Correlation: Predictors vs BMD Outcome
rho	p_value	predictor	outcome
-0.214	0.000	RIDAGEYR	DXXOFBMD
0.402	0.000	BMI_log	DXXOFBMD
0.023	0.264	calcium_log	DXXOFBMD
-0.064	0.002	vitd_total_sqrt	DXXOFBMD

# ️Table B: Predictors vs Exposure (Physical Activity)
table_B_spear <- map_dfr(predictors, function(var) {
  corr_pair_spearman(
    bmd_trans_cor[[var]],
    bmd_trans_cor$MET_sqrt
  ) %>%
    mutate(
      predictor = var,
      outcome = "MET_sqrt"
    )
})

kable(table_B_spear, digits = 3,
      caption = "Spearman Correlation: Predictors vs Physical Activity")

Spearman Correlation: Predictors vs Physical Activity
rho	p_value	predictor	outcome
-0.095	0.000	RIDAGEYR	MET_sqrt
0.030	0.237	BMI_log	MET_sqrt
0.092	0.000	calcium_log	MET_sqrt
-0.003	0.889	vitd_total_sqrt	MET_sqrt

# Table C: Predictors vs Each Other 
pairs <- combn(predictors, 2, simplify = FALSE)

table_C_spear <- map_dfr(pairs, function(x) {
  corr_pair_spearman(
    bmd_trans_cor[[x[1]]],
    bmd_trans_cor[[x[2]]]
  ) %>%
    mutate(
      variable1 = x[1],
      variable2 = x[2]
    )
})

kable(table_C_spear, digits = 3,
      caption = "Spearman Correlation: Predictors vs Each Other")

Spearman Correlation: Predictors vs Each Other
rho	p_value	variable1	variable2
-0.084	0.000	RIDAGEYR	BMI_log
0.038	0.069	RIDAGEYR	calcium_log
0.190	0.000	RIDAGEYR	vitd_total_sqrt
0.023	0.265	BMI_log	calcium_log
-0.012	0.578	BMI_log	vitd_total_sqrt
0.452	0.000	calcium_log	vitd_total_sqrt

###Interpretations:

Using Spearman’s correlation slightly changes the numbers, but overall significance, directions and strength did not change for all variables.

#Part3

##A: Comparing Methods.

1)We use ANOVA when we need to detect the differences between more than 2 independent samples, for example in our case we have compared BMD between 5 ethnicity groups. Example of research question: if we have changed AGE from continuous variable to categories - younger adults, adults-45-65, older adults >65, we could have a research question- Is there a difference in BMd across all the group categories. 2) We use correlation to see any association that is statistically significant among multiple variables in one sample. Although correlation is not causation, it might give information what should be explored in prospective cohort studies or RCt if we want to determine temporality and causality. Example of research question: Is there any correlation between sugar intake and BMI of a person? As we checked QQ plots for normality of the data we observe some deviation from normality on couple of variables, after calculation there were no drastic difference in results.

##B: Assumptions and Limitations:

As we have rather large sample size (>2000) I basically almost ignored outliers, because ANOVA is robust to outliers. Pearson on the other hand is sensitive for outliers. It would be right to check for outliers for Correlation calculation part. So Q-Q plots show some outliers on variables and deviation from normality in data, therefore I used additionally Spearman’s correlation. Of course limitation of correlation in cross-sectional study is that exposure, predictors and outcome is measured in one point of time, therefore we can not prove temporality. And correlation does not mean caution, we should be very careful with interpretations ans conclusions. Most of the cross-sectional studies conclude - we need to conduct more longitudinal studies to check causality of exposure and outcome variables. In addition, unlike regression analysis, correlation analysis does not adjust for potential confounding variables. Therefore, observed associations may be influenced by other unmeasured factors. Even though multicollinearity diagnostics were considered in the analysis, correlation results should still be interpreted cautiously.

##C: R programming dificulties:

This is the first time when I had to code from the beginning to the end. The most difficult part for me was setting datasets and mutating variables in the begging. I realized that I did not pay attention to is.na,na.omit() and other small but crucial logical codes in R. -But the most difficult part was correlation because unlike anova it requires to code and write a lot of lines. It depends on the number of the variables I guess and we had a lot of predictors and exposure variables to check. And specifically three correlations tables was the most difficult part as it took half of this day and was quite a headache. corr_pair <- function(x, y) { test <- cor.test(x, y, method = “pearson”) data.frame( r = as.numeric(test\(estimate), p_value = test\)p.value )

Epi553_HW01_Azamatova_Aigerim

Azamatova_Aigerim

2026-02-20