# Load required packages
install.packages("tidyverse")
install.packages("car")
install.packages("kableExtra")
install.packages("broom")
install.packages("rmarkdown")
setwd("C:\\Users\\userp\\OneDrive\\Рабочий стол\\HW1_HEPI553 Nursultan")
bmd <- read.csv("bmd.csv")
head(bmd)
##    SEQN RIAGENDR RIDAGEYR RIDRETH1 BMXBMI smoker totmet metcat DXXOFBMD tbmdcat
## 1 93705        2       66        4   31.7      2    240      0    1.058       0
## 2 93708        2       66        5   23.7      3    120      0    0.801       1
## 3 93709        2       75        4   38.9      1    720      1    0.880       0
## 4 93711        1       56        5   21.3      3    840      1    0.851       1
## 5 93713        1       67        3   23.5      1    360      0    0.778       1
## 6 93714        2       54        4   39.9      2     NA     NA    0.994       0
##   calcium vitd DSQTVD DSQTCALC
## 1   503.5 1.85 20.557   211.67
## 2   473.5 5.85 25.000   820.00
## 3      NA   NA     NA       NA
## 4  1248.5 3.85 25.000    35.00
## 5   660.5 2.35     NA    13.33
## 6   776.0 5.65     NA       NA
str(bmd)
## 'data.frame':    2898 obs. of  14 variables:
##  $ SEQN    : int  93705 93708 93709 93711 93713 93714 93715 93716 93721 93722 ...
##  $ RIAGENDR: int  2 2 2 1 1 2 1 1 2 2 ...
##  $ RIDAGEYR: int  66 66 75 56 67 54 71 61 60 60 ...
##  $ RIDRETH1: int  4 5 4 5 3 4 5 5 1 3 ...
##  $ BMXBMI  : num  31.7 23.7 38.9 21.3 23.5 39.9 22.5 30.7 35.9 23.8 ...
##  $ smoker  : int  2 3 1 3 1 2 1 2 3 3 ...
##  $ totmet  : int  240 120 720 840 360 NA 6320 2400 NA NA ...
##  $ metcat  : int  0 0 1 1 0 NA 2 2 NA NA ...
##  $ DXXOFBMD: num  1.058 0.801 0.88 0.851 0.778 ...
##  $ tbmdcat : int  0 1 0 1 1 0 0 0 NA 1 ...
##  $ calcium : num  504 474 NA 1248 660 ...
##  $ vitd    : num  1.85 5.85 NA 3.85 2.35 5.65 3.75 4.45 6.05 6.45 ...
##  $ DSQTVD  : num  20.6 25 NA 25 NA ...
##  $ DSQTCALC: num  211.7 820 NA 35 13.3 ...

#PART 0


bmd <- bmd %>%
  mutate(
    ethnicity = factor(RIDRETH1,
                       levels = c(1, 2, 3, 4, 5),
                       labels = c(
                         "Mexican American",
                         "Other Hispanic",
                         "Non-Hispanic White",
                         "Non-Hispanic Black",
                         "Other"
                       )
    ),
    sex = factor(RIAGENDR,
                 levels = c(1, 2),
                 labels = c("Male", "Female")
    ),
    smoker = factor(smoker,
                    levels = c(1, 2, 3),
                    labels = c("Current", "Past", "Never")
    )
  )

nrow(bmd)
## [1] 2898
sum(is.na(bmd$DXXOFBMD))
## [1] 612
sum(is.na(bmd$ethnicity))
## [1] 0
anova_df <- bmd %>%
  filter(!is.na(DXXOFBMD), !is.na(ethnicity))
nrow(anova_df)
## [1] 2286

#PART 1 – ANOVA #Research Question: Do mean BMD values differ across ethnicity groups? ##Step1 #μ₁ = mean BMD for Mexican American; μ₂ = mean BMD for Other Hispanic; μ₃ = mean BMD for Non-Hispanic White; μ₄ = mean BMD for Non-Hispanic Black; μ₅ = mean BMD for Other #Null hypothesis (H₀): All group means are equal. Null hypothesis: There is no difference in average bone mineral density across the five ethnic groups. #Alternative hypothesis (H₁): At least one group mean is different.Alternative hypothesis: At least one ethnic group has a different average bone mineral density compared with the others. ##Step2: Exploratory Analysis

summary_table <- anova_df %>%
  group_by(ethnicity) %>%
  summarise(
    n = n(),
    mean_BMD = mean(DXXOFBMD, na.rm = TRUE),
    sd_BMD = sd(DXXOFBMD, na.rm = TRUE)
  )

summary_table
## # A tibble: 5 × 4
##   ethnicity              n mean_BMD sd_BMD
##   <fct>              <int>    <dbl>  <dbl>
## 1 Mexican American     255    0.950  0.149
## 2 Other Hispanic       244    0.946  0.156
## 3 Non-Hispanic White   846    0.888  0.160
## 4 Non-Hispanic Black   532    0.973  0.161
## 5 Other                409    0.897  0.148
summary_table %>%
  kable(digits = 3,
        col.names = c("Ethnicity", "N", "Mean BMD", "SD BMD")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Ethnicity N Mean BMD SD BMD
Mexican American 255 0.950 0.149
Other Hispanic 244 0.946 0.156
Non-Hispanic White 846 0.888 0.160
Non-Hispanic Black 532 0.973 0.161
Other 409 0.897 0.148
ggplot(anova_df, aes(x = ethnicity, y = DXXOFBMD)) +
  geom_boxplot() +
  labs(
    title = "Bone Mineral Density by Ethnicity",
    x = "Ethnicity",
    y = "Bone Mineral Density (g/cm²)"
  ) +
  theme_minimal()

#The descriptive statistics show differences in mean bone mineral density across ethnic groups. Some groups appear to have higher average BMD and wider variability than others. The boxplot suggests potential differences between groups, which will be formally tested using ANOVA.

#Step 3: ANOVA F-test

anova_model <- aov(DXXOFBMD ~ ethnicity, data = anova_df)
summary(anova_model)
##               Df Sum Sq Mean Sq F value Pr(>F)    
## ethnicity      4   2.95  0.7371   30.18 <2e-16 ***
## Residuals   2281  55.70  0.0244                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#The one-way ANOVA showed a statistically significant difference in mean bone mineral dnsity across ethnic groups, F(4, 2281) = 30.18, p < 0.001. #Therefore, we reject the null hypothesis and conclude that at least one ethnic group has a different mean bone mineral density compared with the others.

#Step 4: Assumption Checks # Q-Q plot

qqnorm(residuals(anova_model))
qqline(residuals(anova_model))

#A Q-Q plot of the residuals was examined to assess the normality assumption. The residuals appeared to follow the reference line reasonably well, suggesting that the normality assumption was approximately satisfied. # Levene’s test

leveneTest(DXXOFBMD ~ ethnicity, data = anova_df)
## Levene's Test for Homogeneity of Variance (center = median)
##         Df F value Pr(>F)
## group    4  1.5728 0.1788
##       2281

#Levene’s test was conducted to assess equality of variances across ethnic groups. The test was not statistically significant (F(4, 2281) = 1.57, p = 0.179), indicating that the assumption of equal variances was met.

#Step 5: Post-hoc Comparisons

TukeyHSD(anova_model)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = DXXOFBMD ~ ethnicity, data = anova_df)
## 
## $ethnicity
##                                               diff          lwr         upr
## Other Hispanic-Mexican American       -0.004441273 -0.042644130  0.03376158
## Non-Hispanic White-Mexican American   -0.062377249 -0.092852618 -0.03190188
## Non-Hispanic Black-Mexican American    0.022391619 -0.010100052  0.05488329
## Other-Mexican American                -0.053319344 -0.087357253 -0.01928144
## Non-Hispanic White-Other Hispanic     -0.057935976 -0.088934694 -0.02693726
## Non-Hispanic Black-Other Hispanic      0.026832892 -0.006150151  0.05981593
## Other-Other Hispanic                  -0.048878071 -0.083385341 -0.01437080
## Non-Hispanic Black-Non-Hispanic White  0.084768868  0.061164402  0.10837333
## Other-Non-Hispanic White               0.009057905 -0.016633367  0.03474918
## Other-Non-Hispanic Black              -0.075710963 -0.103764519 -0.04765741
##                                           p adj
## Other Hispanic-Mexican American       0.9978049
## Non-Hispanic White-Mexican American   0.0000003
## Non-Hispanic Black-Mexican American   0.3276613
## Other-Mexican American                0.0001919
## Non-Hispanic White-Other Hispanic     0.0000036
## Non-Hispanic Black-Other Hispanic     0.1722466
## Other-Other Hispanic                  0.0010720
## Non-Hispanic Black-Non-Hispanic White 0.0000000
## Other-Non-Hispanic White              0.8719109
## Other-Non-Hispanic Black              0.0000000

#Tukey’s post-hoc test was conducted to identify which ethnic groups differed significantly in mean bone mineral density. The results showed several statistically significant differences between groups. #Non-Hispanic White participants had significantly lower mean BMD than Mexican American participants (p < 0.001) and Other Hispanic participants (p < 0.001). The “Other” ethnic group also had significantly lower mean BMD than both Mexican American (p < 0.001) and Other Hispanic participants (p = 0.001). #In contrast, Non-Hispanic Black participants had significantly higher mean BMD than Non-Hispanic White participants (p < 0.001). Additionally, the “Other” group had significantly lower mean BMD than Non-Hispanic Black participants (p < 0.001). #There were no statistically significant differences between Mexican American and Other Hispanic groups, or between Mexican American and Non-Hispanic Black groups. #Overall, these results indicate that the significant ANOVA finding is driven primarily by differences involving Non-Hispanic White, Non-Hispanic Black, and the “Other” ethnic group.

#Step 6: Effect Size #ANOVA table: SS between (ethnicity) = 2.95; SS residual = 55.70; SS total = 2.95 + 55.70 = 58.65 #Eta-squared calculation 𝜂2= 2.95/58.65≈0.05 #The effect size (η² ≈ 0.05) indicates a small to medium effect, meaning ethnicity explains about 5% of the variation in bone mineral density.


##PART 2: Correlation Analysis

#Step 1: Create Total Intake Variables

anova_df <- anova_df %>%
  mutate(
    calcium_total = calcium + ifelse(is.na(DSQTCALC), 0, DSQTCALC),
    vitd_total = vitd + ifelse(is.na(DSQTVD), 0, DSQTVD)
  )
summary(anova_df$calcium_total)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    55.0   604.5   901.5  1001.5  1288.5  5399.0     157
summary(anova_df$vitd_total)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    2.95    8.60   29.12   30.40 2574.65     157

#Step 2: Assess and Apply Transformations

anova_df <- anova_df %>%
  mutate(
    log_BMI = log(BMXBMI),
    log_calcium = log(calcium_total + 1),
    sqrt_vitd = sqrt(vitd_total),
    sqrt_MET = sqrt(totmet)
  )

#Step 3: Three Correlation Tables #TABLE A

cor.test(anova_df$RIDAGEYR, anova_df$DXXOFBMD)
## 
##  Pearson's product-moment correlation
## 
## data:  anova_df$RIDAGEYR and anova_df$DXXOFBMD
## t = -11.42, df = 2284, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2708250 -0.1932528
## sample estimates:
##        cor 
## -0.2324085
cor.test(anova_df$log_BMI, anova_df$DXXOFBMD)
## 
##  Pearson's product-moment correlation
## 
## data:  anova_df$log_BMI and anova_df$DXXOFBMD
## t = 22.368, df = 2273, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3904627 0.4578472
## sample estimates:
##      cor 
## 0.424743
cor.test(anova_df$log_calcium, anova_df$DXXOFBMD)
## 
##  Pearson's product-moment correlation
## 
## data:  anova_df$log_calcium and anova_df$DXXOFBMD
## t = 0.53621, df = 2127, p-value = 0.5919
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03087143  0.05408111
## sample estimates:
##        cor 
## 0.01162582
cor.test(anova_df$sqrt_vitd, anova_df$DXXOFBMD)
## 
##  Pearson's product-moment correlation
## 
## data:  anova_df$sqrt_vitd and anova_df$DXXOFBMD
## t = -2.8616, df = 2127, p-value = 0.004256
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1041374 -0.0194987
## sample estimates:
##         cor 
## -0.06192941

Table A: Correlation of Predictors with BMD

#| Predictor | r | p-value | n | # |———–|—|———|—| # | Age vs BMD | -0.232 | <0.001 | 2286 | # | Log(BMI) vs BMD | 0.425 | <0.001 | 2275 | # | Log(Calcium) vs BMD | 0.012 | 0.592 | 2129 | # | Sqrt(Vitamin D) vs BMD | -0.062 | 0.004 | 2129 |

#Age showed a moderate negative correlation with bone mineral density, indicating that BMD tends to decrease as age increases. #Log-transformed BMI showed a moderate positive correlation with BMD, suggesting higher BMI is associated with higher bone mineral density. #Total calcium intake showed no significant correlation with BMD. #Vitamin D intake had a weak but statistically significant negative correlation with BMD, although the strength of this association was very small.

#TABLE B

cor.test(anova_df$RIDAGEYR, anova_df$sqrt_MET)
## 
##  Pearson's product-moment correlation
## 
## data:  anova_df$RIDAGEYR and anova_df$sqrt_MET
## t = -4.2121, df = 1586, p-value = 2.672e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.15357551 -0.05627992
## sample estimates:
##        cor 
## -0.1051794
cor.test(anova_df$log_BMI, anova_df$sqrt_MET)
## 
##  Pearson's product-moment correlation
## 
## data:  anova_df$log_BMI and anova_df$sqrt_MET
## t = 0.34299, df = 1580, p-value = 0.7317
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.04067274  0.05788773
## sample estimates:
##         cor 
## 0.008628455
cor.test(anova_df$log_calcium, anova_df$sqrt_MET)
## 
##  Pearson's product-moment correlation
## 
## data:  anova_df$log_calcium and anova_df$sqrt_MET
## t = 2.625, df = 1498, p-value = 0.008752
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.01711285 0.11787742
## sample estimates:
##        cor 
## 0.06766769
cor.test(anova_df$sqrt_vitd, anova_df$sqrt_MET)
## 
##  Pearson's product-moment correlation
## 
## data:  anova_df$sqrt_vitd and anova_df$sqrt_MET
## t = -0.13491, df = 1498, p-value = 0.8927
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.05408958  0.04713609
## sample estimates:
##          cor 
## -0.003485677

###Table B: Correlation of Predictors with Physical Activity # | Predictor | r | p-value | n | # | ———————- | —— | ——- | —- | # | Age vs MET | -0.105 | <0.001 | 1588 | # | Log(BMI) vs MET | 0.009 | 0.732 | 1582 | # | Log(Calcium) vs MET | 0.068 | 0.009 | 1500 | # | Sqrt(Vitamin D) vs MET | -0.003 | 0.893 | 1500 |

#Age showed a weak but statistically significant negative correlation with physical activity, indicating that activity levels tended to decrease slightly with increasing age. #Log-transformed BMI showed no significant correlation with physical activity. #Total calcium intake had a very weak but statistically significant positive correlation with physical activity. #Vitamin D intake showed no significant association with physical activity.

#TABLE C

cor.test(anova_df$RIDAGEYR, anova_df$log_BMI)
## 
##  Pearson's product-moment correlation
## 
## data:  anova_df$RIDAGEYR and anova_df$log_BMI
## t = -4.4434, df = 2273, p-value = 9.282e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.13338420 -0.05189885
## sample estimates:
##        cor 
## -0.0927969
cor.test(anova_df$RIDAGEYR, anova_df$log_calcium)
## 
##  Pearson's product-moment correlation
## 
## data:  anova_df$RIDAGEYR and anova_df$log_calcium
## t = 3.0798, df = 2127, p-value = 0.002098
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.0242168 0.1088043
## sample estimates:
##        cor 
## 0.06663025
cor.test(anova_df$RIDAGEYR, anova_df$sqrt_vitd)
## 
##  Pearson's product-moment correlation
## 
## data:  anova_df$RIDAGEYR and anova_df$sqrt_vitd
## t = 7.0749, df = 2127, p-value = 2.021e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1098568 0.1928708
## sample estimates:
##       cor 
## 0.1516312
cor.test(anova_df$log_BMI, anova_df$log_calcium)
## 
##  Pearson's product-moment correlation
## 
## data:  anova_df$log_BMI and anova_df$log_calcium
## t = 0.40429, df = 2120, p-value = 0.686
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03378442  0.05131310
## sample estimates:
##         cor 
## 0.008780236
cor.test(anova_df$log_BMI, anova_df$sqrt_vitd)
## 
##  Pearson's product-moment correlation
## 
## data:  anova_df$log_BMI and anova_df$sqrt_vitd
## t = 1.2121, df = 2120, p-value = 0.2256
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.01625345  0.06879178
## sample estimates:
##        cor 
## 0.02631678
cor.test(anova_df$log_calcium, anova_df$sqrt_vitd)
## 
##  Pearson's product-moment correlation
## 
## data:  anova_df$log_calcium and anova_df$sqrt_vitd
## t = 14.048, df = 2127, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2520132 0.3297757
## sample estimates:
##       cor 
## 0.2913758

###Table C: Correlation Among Predictors # | Predictor Pair | r | p-value | n | # | ——————————- | —— | ——- | —- | # | Age vs Log(BMI) | -0.093 | <0.001 | 2275 | # | Age vs Log(Calcium) | 0.067 | 0.002 | 2129 | # | Age vs Sqrt(Vitamin D) | 0.152 | <0.001 | 2129 | # | Log(BMI) vs Log(Calcium) | 0.009 | 0.686 | 2122 | # | Log(BMI) vs Sqrt(Vitamin D) | 0.026 | 0.226 | 2122 | # | Log(Calcium) vs Sqrt(Vitamin D) | 0.291 | <0.001 | 2129 |

#Most predictors showed weak correlations with each other. #Age had a weak negative correlation with BMI and weak positive correlations with both calcium and vitamin D intake. #BMI was not significantly correlated with either calcium or vitamin D intake. #The strongest association was observed between calcium and vitamin D intake, which showed a moderate positive correlation, suggesting that individuals with higher calcium intake also tended to have higher vitamin D intake.


#Part 3: Reflection

#ANOVA and correlation are used for different types of research questions and serve different purposes in statistical analysis. #ANOVA is appropriate when comparing the mean of a continuous outcome across two or more groups defined by a categorical variable. #In this assignment, ANOVA was used to determine whether mean bone mineral density differed across ethnic groups. #In contrast, correlation is used to measure the strength and direction of the linear relationship between two continuous variables. #For example, correlation can be used to examine the relationship between age and bone mineral density or between BMI and physical activity. #While ANOVA identifies differences between group means, correlation assesses associations between variables without grouping.

#One of the important assumptions in ANOVA is the homogeneity of variances, and in correlation analysis, the assumptions include linear relationships and approximate normality of variables. #Checking these assumptions was an important step in the analysis. If the assumption of equal variances is violated, the ANOVA results may become unreliable and lead to incorrect conclusions. #Similarly, if variables have strong skewness or non-linear relationships, correlation coefficients may not accurately represent the association. #Another key limitation is that this dataset is cross-sectional, meaning that correlations cannot be interpreted as causal relationships. #The associations observed only reflect relationships at one point in time and do not establish cause-and-effect.

#The most challenging part of the assignment was preparing the data and applying the correct transformations before running the correlation analysis. #It required careful inspection of variable distributions and handling missing values appropriately. I addressed this challenge by reviewing the histograms and applying log or square-root transformations as recommended. #This process strengthened my understanding of data cleaning, transformation, and the importance of checking assumptions before conducting statistical analyses.