{r knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
## Warning: package 'readxl' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Warning: package 'gtsummary' was built under R version 4.4.3
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
## Warning: package 'broom' was built under R version 4.4.3
# Load the package
library(readxl)
# Read the Excel file
bmd_data <- read_excel("D:/fizza documents/Epi 553/Assignments/HW02/bmd.xlsx")
# Check the data
head(bmd_data)## # A tibble: 6 × 14
## SEQN RIAGENDR RIDAGEYR RIDRETH1 BMXBMI smoker totmet metcat DXXOFBMD tbmdcat
## <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 93705 2 66 4 31.7 2 240 0 1.058000… 0
## 2 93708 2 66 5 23.7 3 120 0 0.801000… 1
## 3 93709 2 75 4 38.9 1 720 1 0.88 0
## 4 93711 1 56 5 21.3 3 840 1 0.850999… 1
## 5 93713 1 67 3 23.5 1 360 0 0.778000… 1
## 6 93714 2 54 4 39.9 2 NA NA 0.993999… 0
## # ℹ 4 more variables: calcium <chr>, vitd <chr>, DSQTVD <chr>, DSQTCALC <chr>
## [1] "SEQN" "RIAGENDR" "RIDAGEYR" "RIDRETH1" "BMXBMI" "smoker"
## [7] "totmet" "metcat" "DXXOFBMD" "tbmdcat" "calcium" "vitd"
## [13] "DSQTVD" "DSQTCALC"
## SEQN RIAGENDR RIDAGEYR RIDRETH1
## Min. : 93705 Min. :1.00 Min. :50.0 Min. :1.00
## 1st Qu.: 95902 1st Qu.:1.00 1st Qu.:58.0 1st Qu.:3.00
## Median : 98233 Median :2.00 Median :64.0 Median :3.00
## Mean : 98257 Mean :1.51 Mean :65.2 Mean :3.26
## 3rd Qu.:100581 3rd Qu.:2.00 3rd Qu.:73.0 3rd Qu.:4.00
## Max. :102952 Max. :2.00 Max. :80.0 Max. :5.00
## BMXBMI smoker totmet metcat
## Length:2898 Length:2898 Length:2898 Length:2898
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## DXXOFBMD tbmdcat calcium vitd
## Length:2898 Length:2898 Length:2898 Length:2898
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## DSQTVD DSQTCALC
## Length:2898 Length:2898
## Class :character Class :character
## Mode :character Mode :character
##
##
##
## Rows: 2,898
## Columns: 14
## $ SEQN <dbl> 93705, 93708, 93709, 93711, 93713, 93714, 93715, 93716, 93721…
## $ RIAGENDR <dbl> 2, 2, 2, 1, 1, 2, 1, 1, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2…
## $ RIDAGEYR <dbl> 66, 66, 75, 56, 67, 54, 71, 61, 60, 60, 64, 67, 70, 53, 57, 7…
## $ RIDRETH1 <dbl> 4, 5, 4, 5, 3, 4, 5, 5, 1, 3, 3, 1, 5, 4, 2, 3, 2, 4, 4, 3, 3…
## $ BMXBMI <chr> "31.7", "23.7", "38.9", "21.3", "23.5", "39.9", "22.5", "30.7…
## $ smoker <chr> "2", "3", "1", "3", "1", "2", "1", "2", "3", "3", "3", "2", "…
## $ totmet <chr> "240", "120", "720", "840", "360", "NA", "6320", "2400", "NA"…
## $ metcat <chr> "0", "0", "1", "1", "0", "NA", "2", "2", "NA", "NA", "2", "0"…
## $ DXXOFBMD <chr> "1.0580000000000001", "0.80100000000000005", "0.88", "0.85099…
## $ tbmdcat <chr> "0", "1", "0", "1", "1", "0", "0", "0", "NA", "1", "0", "0", …
## $ calcium <chr> "503.5", "473.5", "NA", "1248.5", "660.5", "776", "452", "853…
## $ vitd <chr> "1.85", "5.85", "NA", "3.85", "2.35", "5.65", "3.75", "4.45",…
## $ DSQTVD <chr> "20.556999999999999", "25", "NA", "25", "NA", "NA", "NA", "10…
## $ DSQTCALC <chr> "211.67", "820", "NA", "35", "13.33", "NA", "26.67", "1066.67…
##STEP 3: Data Cleaning and Type Conversion
bmd_clean <- bmd %>%
mutate(
# Convert demographic variables
SEQN = as.numeric(SEQN),
RIAGENDR = factor(RIAGENDR, levels = c(1, 2),
labels = c("Male", "Female")),
RIDAGEYR = as.numeric(RIDAGEYR),
RIDRETH1 = factor(RIDRETH1,
levels = c(1, 2, 3, 4, 5),
labels = c("Mexican American",
"Other Hispanic",
"Non-Hispanic White",
"Non-Hispanic Black",
"Other Race")),
# Convert health variables to numeric
BMXBMI = as.numeric(BMXBMI),
totmet = as.numeric(totmet),
DXXOFBMD = as.numeric(DXXOFBMD),
# Convert categorical variables
smoker = factor(smoker),
metcat = factor(metcat),
tbmdcat = factor(tbmdcat),
# Convert nutrient variables
calcium = as.numeric(calcium),
vitd = as.numeric(vitd),
DSQTVD = as.numeric(DSQTVD),
DSQTCALC = as.numeric(DSQTCALC)
)## Warning: There were 7 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `BMXBMI = as.numeric(BMXBMI)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 6 remaining warnings.
## === CLEANED DATA STRUCTURE ===
## Rows: 2,898
## Columns: 14
## $ SEQN <dbl> 93705, 93708, 93709, 93711, 93713, 93714, 93715, 93716, 93721…
## $ RIAGENDR <fct> Female, Female, Female, Male, Male, Female, Male, Male, Femal…
## $ RIDAGEYR <dbl> 66, 66, 75, 56, 67, 54, 71, 61, 60, 60, 64, 67, 70, 53, 57, 7…
## $ RIDRETH1 <fct> Non-Hispanic Black, Other Race, Non-Hispanic Black, Other Rac…
## $ BMXBMI <dbl> 31.7, 23.7, 38.9, 21.3, 23.5, 39.9, 22.5, 30.7, 35.9, 23.8, 2…
## $ smoker <fct> 2, 3, 1, 3, 1, 2, 1, 2, 3, 3, 3, 2, 2, 3, 3, 2, 3, 1, 2, 1, 1…
## $ totmet <dbl> 240, 120, 720, 840, 360, NA, 6320, 2400, NA, NA, 1680, 240, 4…
## $ metcat <fct> 0, 0, 1, 1, 0, NA, 2, 2, NA, NA, 2, 0, 0, 0, 1, NA, 0, NA, 1,…
## $ DXXOFBMD <dbl> 1.058, 0.801, 0.880, 0.851, 0.778, 0.994, 0.952, 1.121, NA, 0…
## $ tbmdcat <fct> 0, 1, 0, 1, 1, 0, 0, 0, NA, 1, 0, 0, 1, 0, 0, 1, NA, NA, 0, N…
## $ calcium <dbl> 504, 474, NA, 1248, 660, 776, 452, 854, 929, 1184, 752, 570, …
## $ vitd <dbl> 1.85, 5.85, NA, 3.85, 2.35, 5.65, 3.75, 4.45, 6.05, 6.45, 3.3…
## $ DSQTVD <dbl> 20.6, 25.0, NA, 25.0, NA, NA, NA, 100.0, 50.0, 46.7, 125.0, 2…
## $ DSQTCALC <dbl> 211.7, 820.0, NA, 35.0, 13.3, NA, 26.7, 1066.7, 35.0, 133.3, …
##
## === SUMMARY STATISTICS ===
## SEQN RIAGENDR RIDAGEYR RIDRETH1
## Min. : 93705 Male :1431 Min. :50.0 Mexican American : 331
## 1st Qu.: 95902 Female:1467 1st Qu.:58.0 Other Hispanic : 286
## Median : 98233 Median :64.0 Non-Hispanic White:1086
## Mean : 98257 Mean :65.2 Non-Hispanic Black: 696
## 3rd Qu.:100581 3rd Qu.:73.0 Other Race : 499
## Max. :102952 Max. :80.0
##
## BMXBMI smoker totmet metcat DXXOFBMD tbmdcat
## Min. :14.2 1 : 449 Min. : 0 0 :996 Min. :0 0 :1445
## 1st Qu.:25.2 2 : 894 1st Qu.: 240 1 :513 1st Qu.:1 1 : 771
## Median :28.7 3 :1553 Median : 480 2 :425 Median :1 2 : 70
## Mean :29.8 NA: 2 Mean :1015 NA:964 Mean :1 NA: 612
## 3rd Qu.:33.4 3rd Qu.:1430 3rd Qu.:1
## Max. :84.4 Max. :9120 Max. :2
## NA's :64 NA's :964 NA's :612
## calcium vitd DSQTVD DSQTCALC
## Min. : 55 Min. : 0.0 Min. : 0 Min. : 1
## 1st Qu.: 525 1st Qu.: 1.8 1st Qu.: 16 1st Qu.: 93
## Median : 773 Median : 3.2 Median : 25 Median : 212
## Mean : 840 Mean : 4.5 Mean : 50 Mean : 358
## 3rd Qu.:1049 3rd Qu.: 5.5 3rd Qu.: 50 3rd Qu.: 500
## Max. :5199 Max. :57.9 Max. :2570 Max. :3220
## NA's :293 NA's :293 NA's :1515 NA's :1633
##STEP 4: Create Derived Variables
bmd_analysis <- bmd_clean %>%
mutate(
# Age groups
age_group = case_when(
RIDAGEYR >= 50 & RIDAGEYR < 60 ~ "50-59 years",
RIDAGEYR >= 60 & RIDAGEYR < 70 ~ "60-69 years",
RIDAGEYR >= 70 ~ "70+ years",
TRUE ~ NA_character_
),
# BMI categories
bmi_category = case_when(
BMXBMI < 18.5 ~ "Underweight",
BMXBMI >= 18.5 & BMXBMI < 25 ~ "Normal",
BMXBMI >= 25 & BMXBMI < 30 ~ "Overweight",
BMXBMI >= 30 ~ "Obese",
TRUE ~ NA_character_
),
# Physical activity level
pa_level = case_when(
totmet < 500 ~ "Sedentary",
totmet >= 500 & totmet < 1000 ~ "Moderately active",
totmet >= 1000 ~ "Highly active",
TRUE ~ NA_character_
),
# Vitamin D deficiency
vitd_deficient = case_when(
vitd < 12 ~ "Deficient",
vitd >= 12 & vitd < 20 ~ "Insufficient",
vitd >= 20 ~ "Sufficient",
TRUE ~ NA_character_
),
# Low BMD (osteoporosis)
low_bmd = case_when(
DXXOFBMD < 0.6 ~ "Low BMD",
DXXOFBMD >= 0.6 ~ "Normal BMD",
TRUE ~ NA_character_
)
) %>%
# Convert new variables to factors
mutate(
age_group = factor(age_group, levels = c("50-59 years", "60-69 years", "70+ years")),
bmi_category = factor(bmi_category, levels = c("Underweight", "Normal", "Overweight", "Obese")),
pa_level = factor(pa_level, levels = c("Sedentary", "Moderately active", "Highly active")),
vitd_deficient = factor(vitd_deficient, levels = c("Deficient", "Insufficient", "Sufficient")),
low_bmd = factor(low_bmd, levels = c("Normal BMD", "Low BMD"))
)
# Check new variables
cat("=== NEW DERIVED VARIABLES ===\n")## === NEW DERIVED VARIABLES ===
##
## Age Group Distribution:
##
## 50-59 years 60-69 years 70+ years
## 880 1057 961
##
## BMI Category Distribution:
##
## Underweight Normal Overweight Obese
## 37 632 987 1178
##
## Physical Activity Level:
##
## Sedentary Moderately active Highly active
## 996 348 590
##
## Vitamin D Status:
##
## Deficient Insufficient Sufficient
## 2469 93 43
##
## Low BMD Status:
##
## Normal BMD Low BMD
## 2243 43
Data Preparation Summary: The original dataset contained 2898 participants. After excluding those with missing BMD or calcium data, the final analytic sample includes 2898 participants. This complete-case analysis approach ensures all participants in the regression have data for both the outcome (BMD) and primary predictor (calcium).
##STEP 5: Missing Data Analysis
missing_summary <- bmd_analysis %>%
summarise(across(everything(),
~sum(is.na(.)),
.names = "missing_{.col}")) %>%
pivot_longer(everything(),
names_to = "variable",
values_to = "n_missing") %>%
mutate(
variable = gsub("missing_", "", variable),
n_total = nrow(bmd_analysis),
percent_missing = round((n_missing / n_total) * 100, 2)
) %>%
filter(percent_missing > 0) %>%
arrange(desc(percent_missing))
cat("=== MISSING DATA SUMMARY ===\n")## === MISSING DATA SUMMARY ===
## # A tibble: 11 × 4
## variable n_missing n_total percent_missing
## <chr> <int> <int> <dbl>
## 1 DSQTCALC 1633 2898 56.4
## 2 DSQTVD 1515 2898 52.3
## 3 totmet 964 2898 33.3
## 4 pa_level 964 2898 33.3
## 5 DXXOFBMD 612 2898 21.1
## 6 low_bmd 612 2898 21.1
## 7 calcium 293 2898 10.1
## 8 vitd 293 2898 10.1
## 9 vitd_deficient 293 2898 10.1
## 10 BMXBMI 64 2898 2.21
## 11 bmi_category 64 2898 2.21
# Visualize missing data
missing_plot <- missing_summary %>%
ggplot(aes(x = reorder(variable, percent_missing), y = percent_missing)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(title = "Figure 1. Missing Data by Variable",
x = "Variable",
y = "Percent Missing (%)",
caption = paste0("Total N = ", nrow(bmd_analysis))) +
theme_minimal()
print(missing_plot)##STEP 6: Descriptive Statistics - Table 1
library(gtsummary)
# Create Table 1 stratified by low BMD status
table1 <- bmd_analysis %>%
select(
`Age (years)` = RIDAGEYR,
`Age Group` = age_group,
`Gender` = RIAGENDR,
`Race/Ethnicity` = RIDRETH1,
`BMI (kg/m2)` = BMXBMI,
`BMI Category` = bmi_category,
`Physical Activity (MET-min)` = totmet,
`PA Level` = pa_level,
`BMD (g/cm2)` = DXXOFBMD,
`Low BMD` = low_bmd,
`Calcium` = calcium,
`Vitamin D` = vitd,
`Vitamin D Status` = vitd_deficient
) %>%
tbl_summary(
by = `Low BMD`,
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
digits = all_continuous() ~ 1,
missing_text = "Missing"
) %>%
add_overall() %>%
add_p() %>%
modify_header(
label = "**Characteristic**",
all_stat_cols() ~ "**{level}** \nN = {n} ({p}%)"
) %>%
modify_caption("**Table 1. Participant Characteristics by Low BMD Status**") %>%
bold_labels()## 612 missing rows in the "Low BMD" column have been removed.
## The following errors were returned during `modify_caption()`:
## ✖ For variable `Age (years)` (`Low BMD`) and "estimate", "std.error",
## "parameter", "statistic", "conf.low", "conf.high", and "p.value" statistics:
## namespace 'dplyr' 1.1.4 is already loaded, but >= 1.2.0 is required
## ✖ For variable `Age Group` (`Low BMD`) and "estimate", "std.error",
## "parameter", "statistic", "conf.low", "conf.high", and "p.value" statistics:
## namespace 'dplyr' 1.1.4 is already loaded, but >= 1.2.0 is required
## ✖ For variable `BMD (g/cm2)` (`Low BMD`) and "estimate", "std.error",
## "parameter", "statistic", "conf.low", "conf.high", and "p.value" statistics:
## namespace 'dplyr' 1.1.4 is already loaded, but >= 1.2.0 is required
## ✖ For variable `BMI (kg/m2)` (`Low BMD`) and "estimate", "std.error",
## "parameter", "statistic", "conf.low", "conf.high", and "p.value" statistics:
## namespace 'dplyr' 1.1.4 is already loaded, but >= 1.2.0 is required
## ✖ For variable `BMI Category` (`Low BMD`) and "estimate", "std.error",
## "parameter", "statistic", "conf.low", "conf.high", and "p.value" statistics:
## namespace 'dplyr' 1.1.4 is already loaded, but >= 1.2.0 is required
## ✖ For variable `Calcium` (`Low BMD`) and "estimate", "std.error", "parameter",
## "statistic", "conf.low", "conf.high", and "p.value" statistics: namespace
## 'dplyr' 1.1.4 is already loaded, but >= 1.2.0 is required
## ✖ For variable `Gender` (`Low BMD`) and "estimate", "std.error", "parameter",
## "statistic", "conf.low", "conf.high", and "p.value" statistics: namespace
## 'dplyr' 1.1.4 is already loaded, but >= 1.2.0 is required
## ✖ For variable `PA Level` (`Low BMD`) and "estimate", "std.error", "parameter",
## "statistic", "conf.low", "conf.high", and "p.value" statistics: namespace
## 'dplyr' 1.1.4 is already loaded, but >= 1.2.0 is required
## ✖ For variable `Physical Activity (MET-min)` (`Low BMD`) and "estimate",
## "std.error", "parameter", "statistic", "conf.low", "conf.high", and "p.value"
## statistics: namespace 'dplyr' 1.1.4 is already loaded, but >= 1.2.0 is
## required
## ✖ For variable `Race/Ethnicity` (`Low BMD`) and "estimate", "std.error",
## "parameter", "statistic", "conf.low", "conf.high", and "p.value" statistics:
## namespace 'dplyr' 1.1.4 is already loaded, but >= 1.2.0 is required
## ✖ For variable `Vitamin D` (`Low BMD`) and "estimate", "std.error",
## "parameter", "statistic", "conf.low", "conf.high", and "p.value" statistics:
## namespace 'dplyr' 1.1.4 is already loaded, but >= 1.2.0 is required
## ✖ For variable `Vitamin D Status` (`Low BMD`) and "estimate", "std.error",
## "parameter", "statistic", "conf.low", "conf.high", and "p.value" statistics:
## namespace 'dplyr' 1.1.4 is already loaded, but >= 1.2.0 is required
| Characteristic | Overall N = 2286 (1%)1 |
Normal BMD N = 2243 (0.981189851268591%)1 |
Low BMD N = 43 (0.0188101487314086%)1 |
p-value |
|---|---|---|---|---|
| Age (years) | 64.7 (9.0) | 64.5 (9.0) | 74.1 (7.5) | |
| Age Group | ||||
| 50-59 years | 718 (31%) | 716 (32%) | 2 (4.7%) | |
| 60-69 years | 866 (38%) | 858 (38%) | 8 (19%) | |
| 70+ years | 702 (31%) | 669 (30%) | 33 (77%) | |
| Gender | ||||
| Male | 1,199 (52%) | 1,196 (53%) | 3 (7.0%) | |
| Female | 1,087 (48%) | 1,047 (47%) | 40 (93%) | |
| Race/Ethnicity | ||||
| Mexican American | 255 (11%) | 252 (11%) | 3 (7.0%) | |
| Other Hispanic | 244 (11%) | 240 (11%) | 4 (9.3%) | |
| Non-Hispanic White | 846 (37%) | 820 (37%) | 26 (60%) | |
| Non-Hispanic Black | 532 (23%) | 528 (24%) | 4 (9.3%) | |
| Other Race | 409 (18%) | 403 (18%) | 6 (14%) | |
| BMI (kg/m2) | 29.0 (6.0) | 29.1 (6.0) | 22.7 (4.8) | |
| Missing | 11 | 11 | 0 | |
| BMI Category | ||||
| Underweight | 28 (1.2%) | 22 (1.0%) | 6 (14%) | |
| Normal | 539 (24%) | 512 (23%) | 27 (63%) | |
| Overweight | 858 (38%) | 852 (38%) | 6 (14%) | |
| Obese | 850 (37%) | 846 (38%) | 4 (9.3%) | |
| Missing | 11 | 11 | 0 | |
| Physical Activity (MET-min) | 1,026.5 (1,250.0) | 1,029.7 (1,254.5) | 786.7 (828.5) | |
| Missing | 698 | 676 | 22 | |
| PA Level | ||||
| Sedentary | 810 (51%) | 798 (51%) | 12 (57%) | |
| Moderately active | 286 (18%) | 283 (18%) | 3 (14%) | |
| Highly active | 492 (31%) | 486 (31%) | 6 (29%) | |
| Missing | 698 | 676 | 22 | |
| BMD (g/cm2) | 0.9 (0.2) | 0.9 (0.2) | 0.5 (0.1) | |
| Calcium | 848.1 (460.2) | 849.9 (461.1) | 752.0 (405.5) | |
| Missing | 157 | 153 | 4 | |
| Vitamin D | 4.6 (5.1) | 4.6 (5.1) | 4.3 (3.5) | |
| Missing | 157 | 153 | 4 | |
| Vitamin D Status | ||||
| Deficient | 2,009 (94%) | 1,971 (94%) | 38 (97%) | |
| Insufficient | 82 (3.9%) | 81 (3.9%) | 1 (2.6%) | |
| Sufficient | 38 (1.8%) | 38 (1.8%) | 0 (0%) | |
| Missing | 157 | 153 | 4 | |
| 1 Mean (SD); n (%) | ||||
# Save table
if(!dir.exists("tables")) dir.create("tables")
table1 %>%
as_gt() %>%
gt::gtsave("tables/bmd_table1.html")##STEP 7: Exploratory Data Analysis - Distributions
p1 <- ggplot(bmd_analysis, aes(x = DXXOFBMD)) +
geom_histogram(aes(y = after_stat(density)),
binwidth = 0.05,
fill = "steelblue",
color = "black",
alpha = 0.7) +
geom_density(color = "darkred", linewidth = 1) +
geom_vline(aes(xintercept = mean(DXXOFBMD, na.rm = TRUE)),
color = "darkgreen",
linetype = "dashed",
linewidth = 1) +
labs(
title = "Figure 2. Distribution of Bone Mineral Density",
x = "BMD (g/cm²)",
y = "Density",
caption = paste0("N = ", nrow(bmd_analysis))
) +
theme_minimal()
print(p1)## Warning: Removed 612 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 612 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 612 rows containing non-finite outside the scale range (`stat_bin()`).
## Removed 612 rows containing non-finite outside the scale range
## (`stat_density()`).
# Distribution of Age
p2 <- ggplot(bmd_analysis, aes(x = RIDAGEYR)) +
geom_histogram(binwidth = 2, fill = "steelblue", color = "black", alpha = 0.7) +
labs(
title = "Figure 3. Distribution of Age",
x = "Age (years)",
y = "Count"
) +
theme_minimal()
print(p2)ggsave("figures/age_distribution.png", p2, width = 8, height = 5)
# Distribution of BMI
p3 <- ggplot(bmd_analysis, aes(x = BMXBMI)) +
geom_histogram(binwidth = 2, fill = "steelblue", color = "black", alpha = 0.7) +
labs(
title = "Figure 4. Distribution of BMI",
x = "BMI (kg/m²)",
y = "Count"
) +
theme_minimal()
print(p3)## Warning: Removed 64 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 64 rows containing non-finite outside the scale range
## (`stat_bin()`).
##STEP 8: Bivariate Associations
p4 <- ggplot(bmd_analysis, aes(x = age_group, y = DXXOFBMD, fill = age_group)) +
geom_boxplot(alpha = 0.7) +
labs(
title = "Figure 5. BMD by Age Group",
x = "Age Group",
y = "BMD (g/cm²)"
) +
theme_minimal() +
theme(legend.position = "none")
print(p4)## Warning: Removed 612 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 612 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# BMD by Gender
p5 <- ggplot(bmd_analysis, aes(x = RIAGENDR, y = DXXOFBMD, fill = RIAGENDR)) +
geom_boxplot(alpha = 0.7) +
labs(
title = "Figure 6. BMD by Gender",
x = "Gender",
y = "BMD (g/cm²)"
) +
scale_fill_manual(values = c("Male" = "#2E8B57", "Female" = "#CD5C5C")) +
theme_minimal() +
theme(legend.position = "none")
print(p5)## Warning: Removed 612 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 612 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# BMD by Physical Activity Level
p6 <- ggplot(bmd_analysis, aes(x = pa_level, y = DXXOFBMD, fill = pa_level)) +
geom_boxplot(alpha = 0.7) +
labs(
title = "Figure 7. BMD by Physical Activity Level",
x = "Physical Activity Level",
y = "BMD (g/cm²)"
) +
theme_minimal() +
theme(legend.position = "none")
print(p6)## Warning: Removed 612 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 612 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# BMD by Vitamin D Status
p7 <- ggplot(bmd_analysis, aes(x = vitd_deficient, y = DXXOFBMD, fill = vitd_deficient)) +
geom_boxplot(alpha = 0.7) +
labs(
title = "Figure 8. BMD by Vitamin D Status",
x = "Vitamin D Status",
y = "BMD (g/cm²)"
) +
theme_minimal() +
theme(legend.position = "none")
print(p7)## Warning: Removed 612 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 612 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
Research Question: Is there a linear association between dietary calcium intake (calcium, mg/day) and total femur bone mineral density (DXXOFBMD, g/cm²)?
# Create a scatterplot with a fitted regression line
ggplot(bmd_analysis, aes(x = calcium, y = DXXOFBMD)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "lm", se = FALSE, color = "steelblue") +
labs(
title = "Figure 1. Relationship Between Dietary Calcium and Bone Mineral Density",
x = "Calcium Intake (mg/day)",
y = "Total Femur BMD (g/cm²)"
) +
theme_minimal()## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 769 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 769 rows containing missing values or values outside the scale range
## (`geom_point()`).
Written interpretation (3–5 sentences):
The scatterplot shows a weak positive linear relationship between dietary calcium intake and bone mineral density. As calcium intake increases, BMD tends to increase slightly, but the points are widely scattered around the regression line, suggesting the association is not strong. There do not appear to be nonlinear patterns or extreme outliers that would violate linear regression assumptions. However, the wide dispersion indicates that calcium intake alone explains little of the variation in BMD.
##STEP 9: Correlation Analysis
library(corrplot)
# Select numeric variables for correlation
numeric_vars <- bmd_analysis %>%
select(DXXOFBMD, RIDAGEYR, BMXBMI, totmet, calcium, vitd) %>%
drop_na()
# Calculate correlation matrix
cor_matrix <- cor(numeric_vars)
# Display correlation matrix
cat("=== CORRELATION MATRIX ===\n")## === CORRELATION MATRIX ===
## DXXOFBMD RIDAGEYR BMXBMI totmet calcium vitd
## DXXOFBMD 1.000 -0.202 0.397 0.084 0.090 0.031
## RIDAGEYR -0.202 1.000 -0.076 -0.099 -0.028 0.050
## BMXBMI 0.397 -0.076 1.000 -0.015 -0.027 -0.022
## totmet 0.084 -0.099 -0.015 1.000 0.084 0.010
## calcium 0.090 -0.028 -0.027 0.084 1.000 0.485
## vitd 0.031 0.050 -0.022 0.010 0.485 1.000
# Visualize correlations
png("figures/bmd_correlation_matrix.png", width = 8, height = 8, units = "in", res = 300)
corrplot(cor_matrix,
method = "color",
type = "upper",
order = "hclust",
tl.col = "black",
tl.srt = 45,
addCoef.col = "black",
number.cex = 0.8,
title = "Figure 9. Correlation Matrix of Key Variables",
mar = c(0, 0, 2, 0))
dev.off()## png
## 2
# Display the plot in R
corrplot(cor_matrix,
method = "color",
type = "upper",
order = "hclust",
tl.col = "black",
tl.srt = 45,
addCoef.col = "black",
number.cex = 0.8,
title = "Figure 9. Correlation Matrix of Key Variables")# Fit the simple linear regression model (calcium as predictor)
model_calcium <- lm(DXXOFBMD ~ calcium, data = bmd_analysis)
# Display the full model summary
summary(model_calcium)##
## Call:
## lm(formula = DXXOFBMD ~ calcium, data = bmd_analysis)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5565 -0.1057 -0.0056 0.1072 0.6262
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.99e-01 7.19e-03 125.04 < 2e-16 ***
## calcium 3.08e-05 7.45e-06 4.13 3.8e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.158 on 2127 degrees of freedom
## (769 observations deleted due to missingness)
## Multiple R-squared: 0.00796, Adjusted R-squared: 0.00749
## F-statistic: 17.1 on 1 and 2127 DF, p-value: 3.75e-05
A. Intercept (β₀):
The intercept (β₀ = 0.899 g/cm²) represents the predicted mean BMD for an individual with zero calcium intake. While mathematically defined, this value is not clinically meaningful in context because zero calcium intake is impossible (and would be incompatible with life). The intercept serves primarily as a mathematical anchor for the regression line rather than a clinically interpretable parameter. It provides the baseline from which the effect of calcium intake is measured.
B. Slope (β₁):
The slope (β₁ = 3.08 × 10⁻⁵ g/cm² per mg/day) indicates that for every 1 mg/day increase in dietary calcium, BMD increases by approximately 0.00003 g/cm². To make this more interpretable, a 500 mg/day increase (roughly two glasses of milk) predicts an increase of 0.015 g/cm² in BMD. This positive association aligns with biological plausibility, though the effect size is quite small given that typical calcium intakes range from 500-1500 mg/day. The p-value (3.75 × 10⁻⁵) indicates this association is statistically significant.
## 2.5 % 97.5 %
## (Intercept) 8.85e-01 9.13e-01
## calcium 1.62e-05 4.54e-05
State your hypotheses:
Report the test results:
The t-statistic for calcium is 4.131 with 2127 degrees of freedom, and p-value = 3.75 × 10⁻⁵. Since p < 0.05, we reject the null hypothesis. This provides strong statistical evidence of an association between calcium intake and BMD.
Interpret the 95% confidence interval for β₁:
We are 95% confident that the true population slope lies between 1.62 × 10⁻⁵ and 4.54 × 10⁻⁵ g/cm² per mg/day. This means that for each 1 mg/day increase in calcium, BMD increases by 0.000016 to 0.000045 g/cm² in the population. In more interpretable terms, a 500 mg/day increase in calcium (roughly two glasses of milk) is associated with a BMD increase between 0.008 and 0.023 g/cm². Since the entire interval is positive (does not include zero), this confirms the positive direction of the association and the statistical significance.
R² (coefficient of determination):
The R² value of 0.008 indicates that only 0.8% of the variance in BMD is explained by calcium intake alone. This very low value suggests the model fits poorly and that many other factors (age, gender, physical activity, vitamin D) likely influence BMD.
Residual Standard Error (RSE):
The RSE of 0.158 g/cm² represents the typical distance between observed BMD values and the regression line. Given that mean BMD in the sample is 0.92 g/cm², this prediction error is substantial relative to the outcome scale.
# Create a new data frame with the target predictor value
new_data <- data.frame(calcium = 1000)
# Fit the simple linear regression model (calcium as predictor)
model_calcium <- lm(DXXOFBMD ~ calcium, data = bmd_analysis)
# 95% confidence interval for the mean response at calcium = 1000
conf_int <- predict(model_calcium, newdata = new_data, interval = "confidence")
# 95% prediction interval for a new individual at calcium = 1000
pred_int <- predict(model_calcium, newdata = new_data, interval = "prediction")
# Display results
cat("Predicted BMD at 1000 mg/day calcium:\n")## Predicted BMD at 1000 mg/day calcium:
## fit lwr upr
## 1 0.93 0.923 0.937
## fit lwr upr
## 1 0.93 0.62 1.24
Written interpretation (3–6 sentences):
Predicted BMD at 1,000 mg/day calcium: The model predicts a mean BMD of 0.930 g/cm² for individuals consuming 1,000 mg/day of calcium.
What does the 95% confidence interval represent? The 95% confidence interval (0.923 to 0.937 g/cm²) estimates the mean BMD for the entire population of people who consume 1,000 mg/day of calcium. We are 95% confident that the true population mean BMD falls within this narrow range. The narrow width reflects precise estimation of the population mean.
What is the 95% prediction interval, and why is it wider? The 95% prediction interval (0.620 to 1.240 g/cm²) predicts BMD for a single new individual consuming 1,000 mg/day of calcium. It is substantially wider than the confidence interval because it accounts for two sources of uncertainty: (1) uncertainty in estimating the population mean, and (2) individual variation around that mean (the residual standard error of 0.158 g/cm²). This reflects the fact that individuals vary considerably in their BMD even at the same calcium intake level.
Is 1,000 mg/day a meaningful value to predict? Yes, 1,000 mg/day is a clinically meaningful value to predict. It falls well within the observed calcium range in the data (approximately 0 to 4,000 mg/day) and represents the typical recommended daily intake for adults. Predicting at this value provides useful information about expected BMD at a common intake level.
##STEP 11: Model Diagnostics
# Residual plots for full model
par(mfrow = c(2, 2))
plot(model4)
par(mfrow = c(1, 1))
# Check for multicollinearity
library(car)
vif_values <- vif(model4)
cat("=== VARIANCE INFLATION FACTOR (VIF) ===\n")
print(vif_values)##STEP 12: Interaction Analysis
# Check if age effect differs by gender
model_interaction <- lm(DXXOFBMD ~ RIDAGEYR * RIAGENDR + BMXBMI + totmet + vitd,
data = bmd_analysis)
cat("=== INTERACTION MODEL: Age * Gender ===\n")
print(summary(model_interaction))
# Compare with main effects model
anova(model4, model_interaction)
# Visualize interaction
interaction_plot <- bmd_analysis %>%
ggplot(aes(x = RIDAGEYR, y = DXXOFBMD, color = RIAGENDR)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "lm", se = TRUE) +
labs(
title = "Figure 10. BMD by Age and Gender",
x = "Age (years)",
y = "BMD (g/cm²)",
color = "Gender"
) +
theme_minimal()
print(interaction_plot)
ggsave("figures/bmd_age_gender_interaction.png", interaction_plot, width = 8, height = 6)##STEP 13: Save Cleaned Data and Workspace
# Save cleaned dataset
saveRDS(bmd_analysis, "bmd_cleaned.rds")
write.csv(bmd_analysis, "tables/bmd_cleaned.csv", row.names = FALSE)
# Save workspace
save(list = ls(), file = "bmd_analysis_workspace.RData")
# List all created files
cat("\n=== FILES CREATED ===\n")##
## === FILES CREATED ===
##
## Figures in 'figures' directory:
## [1] "age_distribution.png" "bmd_by_age.png"
## [3] "bmd_by_gender.png" "bmd_by_pa.png"
## [5] "bmd_by_vitd.png" "bmd_correlation_matrix.png"
## [7] "bmd_distribution.png" "bmd_missing_data.png"
## [9] "bmi_distribution.png"
##
## Tables in 'tables' directory:
## [1] "bmd_cleaned.csv" "bmd_table1.html"
##
## ✅ BMD ANALYSIS COMPLETE!
## Total participants: 2898
#STEP 14: Session Information (for reproducibility)
## === SESSION INFORMATION ===
## R version 4.4.2 (2024-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] broom_1.0.12 corrplot_0.95 gtsummary_2.5.0 lubridate_1.9.3
## [5] forcats_1.0.0 stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2
## [9] readr_2.1.5 tidyr_1.3.1 tibble_3.2.1 ggplot2_4.0.2
## [13] tidyverse_2.0.0 readxl_1.4.5
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 xfun_0.56 bslib_0.10.0 lattice_0.22-6
## [5] tzdb_0.4.0 vctrs_0.6.5 tools_4.4.2 generics_0.1.4
## [9] pkgconfig_2.0.3 Matrix_1.7-1 RColorBrewer_1.1-3 S7_0.2.1
## [13] gt_1.3.0 lifecycle_1.0.5 compiler_4.4.2 farver_2.1.2
## [17] textshaping_0.4.0 litedown_0.9 htmltools_0.5.8.1 sass_0.4.10
## [21] yaml_2.3.10 pillar_1.11.1 jquerylib_0.1.4 cachem_1.1.0
## [25] nlme_3.1-166 commonmark_2.0.0 tidyselect_1.2.1 digest_0.6.37
## [29] stringi_1.8.4 splines_4.4.2 labeling_0.4.3 fastmap_1.2.0
## [33] grid_4.4.2 cli_3.6.3 magrittr_2.0.3 cards_0.7.1
## [37] utf8_1.2.4 withr_3.0.2 scales_1.4.0 backports_1.5.0
## [41] timechange_0.3.0 rmarkdown_2.30 otel_0.2.0 cellranger_1.1.0
## [45] ragg_1.5.0 hms_1.1.4 evaluate_1.0.5 knitr_1.51
## [49] mgcv_1.9-1 markdown_2.0 rlang_1.1.7 glue_1.8.0
## [53] xml2_1.5.2 rstudioapi_0.18.0 jsonlite_2.0.0 R6_2.6.1
## [57] fs_1.6.6 systemfonts_1.3.1
A. Statistical Insight (6 points)
The regression model reveals a statistically significant but clinically weak positive association between dietary calcium and BMD. While the p-value (p < 0.001) indicates the association is unlikely due to chance, the effect size is small—a 500 mg/day increase (roughly two glasses of milk) predicts only about a 0.02 g/cm² increase in BMD. These results were not surprising given the multifactorial nature of bone health.
Key limitations for causal inference include: - Cross-sectional design: Cannot establish temporality (does calcium affect BMD, or do those with higher BMD consume more calcium?) - Residual confounding: Unmeasured factors like physical activity, sun exposure, genetics, and overall diet quality may influence both calcium intake and BMD - Measurement error: Dietary recall is imperfect
Potential confounders include age (bone density decreases with age, and diet may change), gender (women have lower BMD and different calcium needs), physical activity (stimulates bone formation and may correlate with healthier diets), and vitamin D status (essential for calcium absorption).
B. From ANOVA to Regression (5 points)
Homework 1 used one-way ANOVA to compare mean BMD across categorical ethnic groups, answering the question “Does BMD differ by ethnicity?” Regression with a continuous predictor answers “How does BMD change with each unit increase in calcium intake?”
Regression provides advantages ANOVA cannot: - Quantifies the dose-response relationship (slope) - Handles continuous predictors without categorizing (avoiding information loss) - Provides predictions for any predictor value - Extends easily to multiple predictors
ANOVA is preferred when the exposure is naturally categorical (ethnicity, treatment group), while regression is optimal for continuous exposures like nutrients, age, or BMI.
C. R Programming Growth (4 points)
The most challenging part was understanding how to extract and format
specific values from model objects for inline reporting (e.g., 0).
Working through the broom package functions like
tidy() and glance() helped me understand how R
stores model output. I now feel confident extracting coefficients,
confidence intervals, and fit statistics for reporting in reproducible
documents.