{r knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

Load required packages

library(readxl)      # For reading Excel files

## Warning: package 'readxl' was built under R version 4.4.3

library(tidyverse)   # For data manipulation and plotting

## Warning: package 'ggplot2' was built under R version 4.4.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(gtsummary)   # For nice tables

## Warning: package 'gtsummary' was built under R version 4.4.3

library(ggplot2)     # For plots
library(corrplot)    # For correlation plots

## Warning: package 'corrplot' was built under R version 4.4.3

## corrplot 0.95 loaded

library(broom)       # For tidy regression output

## Warning: package 'broom' was built under R version 4.4.3

# Set options
options(digits = 3)
set.seed(553)

# Create output directories
if(!dir.exists("figures")) dir.create("figures")
if(!dir.exists("tables")) dir.create("tables")

Data Loading and Inspect BMD Data

# Load the package
library(readxl)

# Read the Excel file
bmd_data <- read_excel("D:/fizza documents/Epi 553/Assignments/HW02/bmd.xlsx")

# Check the data
head(bmd_data)

## # A tibble: 6 × 14
##    SEQN RIAGENDR RIDAGEYR RIDRETH1 BMXBMI smoker totmet metcat DXXOFBMD  tbmdcat
##   <dbl>    <dbl>    <dbl>    <dbl> <chr>  <chr>  <chr>  <chr>  <chr>     <chr>  
## 1 93705        2       66        4 31.7   2      240    0      1.058000… 0      
## 2 93708        2       66        5 23.7   3      120    0      0.801000… 1      
## 3 93709        2       75        4 38.9   1      720    1      0.88      0      
## 4 93711        1       56        5 21.3   3      840    1      0.850999… 1      
## 5 93713        1       67        3 23.5   1      360    0      0.778000… 1      
## 6 93714        2       54        4 39.9   2      NA     NA     0.993999… 0      
## # ℹ 4 more variables: calcium <chr>, vitd <chr>, DSQTVD <chr>, DSQTCALC <chr>

names(bmd_data)

##  [1] "SEQN"     "RIAGENDR" "RIDAGEYR" "RIDRETH1" "BMXBMI"   "smoker"  
##  [7] "totmet"   "metcat"   "DXXOFBMD" "tbmdcat"  "calcium"  "vitd"    
## [13] "DSQTVD"   "DSQTCALC"

summary(bmd_data)

##       SEQN           RIAGENDR       RIDAGEYR       RIDRETH1   
##  Min.   : 93705   Min.   :1.00   Min.   :50.0   Min.   :1.00  
##  1st Qu.: 95902   1st Qu.:1.00   1st Qu.:58.0   1st Qu.:3.00  
##  Median : 98233   Median :2.00   Median :64.0   Median :3.00  
##  Mean   : 98257   Mean   :1.51   Mean   :65.2   Mean   :3.26  
##  3rd Qu.:100581   3rd Qu.:2.00   3rd Qu.:73.0   3rd Qu.:4.00  
##  Max.   :102952   Max.   :2.00   Max.   :80.0   Max.   :5.00  
##     BMXBMI             smoker             totmet             metcat         
##  Length:2898        Length:2898        Length:2898        Length:2898       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    DXXOFBMD           tbmdcat            calcium              vitd          
##  Length:2898        Length:2898        Length:2898        Length:2898       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     DSQTVD            DSQTCALC        
##  Length:2898        Length:2898       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##

# Quick check
glimpse(bmd_data)

## Rows: 2,898
## Columns: 14
## $ SEQN     <dbl> 93705, 93708, 93709, 93711, 93713, 93714, 93715, 93716, 93721…
## $ RIAGENDR <dbl> 2, 2, 2, 1, 1, 2, 1, 1, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2…
## $ RIDAGEYR <dbl> 66, 66, 75, 56, 67, 54, 71, 61, 60, 60, 64, 67, 70, 53, 57, 7…
## $ RIDRETH1 <dbl> 4, 5, 4, 5, 3, 4, 5, 5, 1, 3, 3, 1, 5, 4, 2, 3, 2, 4, 4, 3, 3…
## $ BMXBMI   <chr> "31.7", "23.7", "38.9", "21.3", "23.5", "39.9", "22.5", "30.7…
## $ smoker   <chr> "2", "3", "1", "3", "1", "2", "1", "2", "3", "3", "3", "2", "…
## $ totmet   <chr> "240", "120", "720", "840", "360", "NA", "6320", "2400", "NA"…
## $ metcat   <chr> "0", "0", "1", "1", "0", "NA", "2", "2", "NA", "NA", "2", "0"…
## $ DXXOFBMD <chr> "1.0580000000000001", "0.80100000000000005", "0.88", "0.85099…
## $ tbmdcat  <chr> "0", "1", "0", "1", "1", "0", "0", "0", "NA", "1", "0", "0", …
## $ calcium  <chr> "503.5", "473.5", "NA", "1248.5", "660.5", "776", "452", "853…
## $ vitd     <chr> "1.85", "5.85", "NA", "3.85", "2.35", "5.65", "3.75", "4.45",…
## $ DSQTVD   <chr> "20.556999999999999", "25", "NA", "25", "NA", "NA", "NA", "10…
## $ DSQTCALC <chr> "211.67", "820", "NA", "35", "13.33", "NA", "26.67", "1066.67…

# Rename the object
bmd <- bmd_data

##STEP 3: Data Cleaning and Type Conversion

Convert character columns to appropriate types

bmd_clean <- bmd %>%
  mutate(
    # Convert demographic variables
    SEQN = as.numeric(SEQN),
    RIAGENDR = factor(RIAGENDR, levels = c(1, 2), 
                      labels = c("Male", "Female")),
    RIDAGEYR = as.numeric(RIDAGEYR),
    RIDRETH1 = factor(RIDRETH1, 
                      levels = c(1, 2, 3, 4, 5),
                      labels = c("Mexican American", 
                                 "Other Hispanic",
                                 "Non-Hispanic White",
                                 "Non-Hispanic Black",
                                 "Other Race")),
    
    # Convert health variables to numeric
    BMXBMI = as.numeric(BMXBMI),
    totmet = as.numeric(totmet),
    DXXOFBMD = as.numeric(DXXOFBMD),
    
    # Convert categorical variables
    smoker = factor(smoker),
    metcat = factor(metcat),
    tbmdcat = factor(tbmdcat),
    
    # Convert nutrient variables
    calcium = as.numeric(calcium),
    vitd = as.numeric(vitd),
    DSQTVD = as.numeric(DSQTVD),
    DSQTCALC = as.numeric(DSQTCALC)
  )

## Warning: There were 7 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `BMXBMI = as.numeric(BMXBMI)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 6 remaining warnings.

# Check the cleaned data
cat("=== CLEANED DATA STRUCTURE ===\n")

## === CLEANED DATA STRUCTURE ===

glimpse(bmd_clean)

## Rows: 2,898
## Columns: 14
## $ SEQN     <dbl> 93705, 93708, 93709, 93711, 93713, 93714, 93715, 93716, 93721…
## $ RIAGENDR <fct> Female, Female, Female, Male, Male, Female, Male, Male, Femal…
## $ RIDAGEYR <dbl> 66, 66, 75, 56, 67, 54, 71, 61, 60, 60, 64, 67, 70, 53, 57, 7…
## $ RIDRETH1 <fct> Non-Hispanic Black, Other Race, Non-Hispanic Black, Other Rac…
## $ BMXBMI   <dbl> 31.7, 23.7, 38.9, 21.3, 23.5, 39.9, 22.5, 30.7, 35.9, 23.8, 2…
## $ smoker   <fct> 2, 3, 1, 3, 1, 2, 1, 2, 3, 3, 3, 2, 2, 3, 3, 2, 3, 1, 2, 1, 1…
## $ totmet   <dbl> 240, 120, 720, 840, 360, NA, 6320, 2400, NA, NA, 1680, 240, 4…
## $ metcat   <fct> 0, 0, 1, 1, 0, NA, 2, 2, NA, NA, 2, 0, 0, 0, 1, NA, 0, NA, 1,…
## $ DXXOFBMD <dbl> 1.058, 0.801, 0.880, 0.851, 0.778, 0.994, 0.952, 1.121, NA, 0…
## $ tbmdcat  <fct> 0, 1, 0, 1, 1, 0, 0, 0, NA, 1, 0, 0, 1, 0, 0, 1, NA, NA, 0, N…
## $ calcium  <dbl> 504, 474, NA, 1248, 660, 776, 452, 854, 929, 1184, 752, 570, …
## $ vitd     <dbl> 1.85, 5.85, NA, 3.85, 2.35, 5.65, 3.75, 4.45, 6.05, 6.45, 3.3…
## $ DSQTVD   <dbl> 20.6, 25.0, NA, 25.0, NA, NA, NA, 100.0, 50.0, 46.7, 125.0, 2…
## $ DSQTCALC <dbl> 211.7, 820.0, NA, 35.0, 13.3, NA, 26.7, 1066.7, 35.0, 133.3, …

# Summary statistics
cat("\n=== SUMMARY STATISTICS ===\n")

## 
## === SUMMARY STATISTICS ===

summary(bmd_clean)

##       SEQN          RIAGENDR       RIDAGEYR                  RIDRETH1   
##  Min.   : 93705   Male  :1431   Min.   :50.0   Mexican American  : 331  
##  1st Qu.: 95902   Female:1467   1st Qu.:58.0   Other Hispanic    : 286  
##  Median : 98233                 Median :64.0   Non-Hispanic White:1086  
##  Mean   : 98257                 Mean   :65.2   Non-Hispanic Black: 696  
##  3rd Qu.:100581                 3rd Qu.:73.0   Other Race        : 499  
##  Max.   :102952                 Max.   :80.0                            
##                                                                         
##      BMXBMI     smoker        totmet     metcat      DXXOFBMD   tbmdcat  
##  Min.   :14.2   1 : 449   Min.   :   0   0 :996   Min.   :0     0 :1445  
##  1st Qu.:25.2   2 : 894   1st Qu.: 240   1 :513   1st Qu.:1     1 : 771  
##  Median :28.7   3 :1553   Median : 480   2 :425   Median :1     2 :  70  
##  Mean   :29.8   NA:   2   Mean   :1015   NA:964   Mean   :1     NA: 612  
##  3rd Qu.:33.4             3rd Qu.:1430            3rd Qu.:1              
##  Max.   :84.4             Max.   :9120            Max.   :2              
##  NA's   :64               NA's   :964             NA's   :612            
##     calcium          vitd          DSQTVD        DSQTCALC   
##  Min.   :  55   Min.   : 0.0   Min.   :   0   Min.   :   1  
##  1st Qu.: 525   1st Qu.: 1.8   1st Qu.:  16   1st Qu.:  93  
##  Median : 773   Median : 3.2   Median :  25   Median : 212  
##  Mean   : 840   Mean   : 4.5   Mean   :  50   Mean   : 358  
##  3rd Qu.:1049   3rd Qu.: 5.5   3rd Qu.:  50   3rd Qu.: 500  
##  Max.   :5199   Max.   :57.9   Max.   :2570   Max.   :3220  
##  NA's   :293    NA's   :293    NA's   :1515   NA's   :1633

##STEP 4: Create Derived Variables

Add derived variables for analysis

bmd_analysis <- bmd_clean %>%
  mutate(
    # Age groups
    age_group = case_when(
      RIDAGEYR >= 50 & RIDAGEYR < 60 ~ "50-59 years",
      RIDAGEYR >= 60 & RIDAGEYR < 70 ~ "60-69 years",
      RIDAGEYR >= 70 ~ "70+ years",
      TRUE ~ NA_character_
    ),
    
    # BMI categories
    bmi_category = case_when(
      BMXBMI < 18.5 ~ "Underweight",
      BMXBMI >= 18.5 & BMXBMI < 25 ~ "Normal",
      BMXBMI >= 25 & BMXBMI < 30 ~ "Overweight",
      BMXBMI >= 30 ~ "Obese",
      TRUE ~ NA_character_
    ),
    
    # Physical activity level
    pa_level = case_when(
      totmet < 500 ~ "Sedentary",
      totmet >= 500 & totmet < 1000 ~ "Moderately active",
      totmet >= 1000 ~ "Highly active",
      TRUE ~ NA_character_
    ),
    
    # Vitamin D deficiency
    vitd_deficient = case_when(
      vitd < 12 ~ "Deficient",
      vitd >= 12 & vitd < 20 ~ "Insufficient",
      vitd >= 20 ~ "Sufficient",
      TRUE ~ NA_character_
    ),
    
    # Low BMD (osteoporosis)
    low_bmd = case_when(
      DXXOFBMD < 0.6 ~ "Low BMD",
      DXXOFBMD >= 0.6 ~ "Normal BMD",
      TRUE ~ NA_character_
    )
  ) %>%
  # Convert new variables to factors
  mutate(
    age_group = factor(age_group, levels = c("50-59 years", "60-69 years", "70+ years")),
    bmi_category = factor(bmi_category, levels = c("Underweight", "Normal", "Overweight", "Obese")),
    pa_level = factor(pa_level, levels = c("Sedentary", "Moderately active", "Highly active")),
    vitd_deficient = factor(vitd_deficient, levels = c("Deficient", "Insufficient", "Sufficient")),
    low_bmd = factor(low_bmd, levels = c("Normal BMD", "Low BMD"))
  )

# Check new variables
cat("=== NEW DERIVED VARIABLES ===\n")

## === NEW DERIVED VARIABLES ===

cat("\nAge Group Distribution:\n")

## 
## Age Group Distribution:

print(table(bmd_analysis$age_group))

## 
## 50-59 years 60-69 years   70+ years 
##         880        1057         961

cat("\nBMI Category Distribution:\n")

## 
## BMI Category Distribution:

print(table(bmd_analysis$bmi_category))

## 
## Underweight      Normal  Overweight       Obese 
##          37         632         987        1178

cat("\nPhysical Activity Level:\n")

## 
## Physical Activity Level:

print(table(bmd_analysis$pa_level))

## 
##         Sedentary Moderately active     Highly active 
##               996               348               590

cat("\nVitamin D Status:\n")

## 
## Vitamin D Status:

print(table(bmd_analysis$vitd_deficient))

## 
##    Deficient Insufficient   Sufficient 
##         2469           93           43

cat("\nLow BMD Status:\n")

## 
## Low BMD Status:

print(table(bmd_analysis$low_bmd))

## 
## Normal BMD    Low BMD 
##       2243         43

Data Preparation Summary: The original dataset contained 2898 participants. After excluding those with missing BMD or calcium data, the final analytic sample includes 2898 participants. This complete-case analysis approach ensures all participants in the regression have data for both the outcome (BMD) and primary predictor (calcium).

##STEP 5: Missing Data Analysis

Calculate missingness

missing_summary <- bmd_analysis %>%
  summarise(across(everything(), 
                   ~sum(is.na(.)),
                   .names = "missing_{.col}")) %>%
  pivot_longer(everything(), 
               names_to = "variable", 
               values_to = "n_missing") %>%
  mutate(
    variable = gsub("missing_", "", variable),
    n_total = nrow(bmd_analysis),
    percent_missing = round((n_missing / n_total) * 100, 2)
  ) %>%
  filter(percent_missing > 0) %>%
  arrange(desc(percent_missing))

cat("=== MISSING DATA SUMMARY ===\n")

## === MISSING DATA SUMMARY ===

print(missing_summary)

## # A tibble: 11 × 4
##    variable       n_missing n_total percent_missing
##    <chr>              <int>   <int>           <dbl>
##  1 DSQTCALC            1633    2898           56.4 
##  2 DSQTVD              1515    2898           52.3 
##  3 totmet               964    2898           33.3 
##  4 pa_level             964    2898           33.3 
##  5 DXXOFBMD             612    2898           21.1 
##  6 low_bmd              612    2898           21.1 
##  7 calcium              293    2898           10.1 
##  8 vitd                 293    2898           10.1 
##  9 vitd_deficient       293    2898           10.1 
## 10 BMXBMI                64    2898            2.21
## 11 bmi_category          64    2898            2.21

# Visualize missing data
missing_plot <- missing_summary %>%
  ggplot(aes(x = reorder(variable, percent_missing), y = percent_missing)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Figure 1. Missing Data by Variable",
       x = "Variable",
       y = "Percent Missing (%)",
       caption = paste0("Total N = ", nrow(bmd_analysis))) +
  theme_minimal()

print(missing_plot)

ggsave("figures/bmd_missing_data.png", missing_plot, width = 8, height = 6)

##STEP 6: Descriptive Statistics - Table 1

library(gtsummary)

# Create Table 1 stratified by low BMD status
table1 <- bmd_analysis %>%
  select(
    `Age (years)` = RIDAGEYR,
    `Age Group` = age_group,
    `Gender` = RIAGENDR,
    `Race/Ethnicity` = RIDRETH1,
    `BMI (kg/m2)` = BMXBMI,
    `BMI Category` = bmi_category,
    `Physical Activity (MET-min)` = totmet,
    `PA Level` = pa_level,
    `BMD (g/cm2)` = DXXOFBMD,
    `Low BMD` = low_bmd,
    `Calcium` = calcium,
    `Vitamin D` = vitd,
    `Vitamin D Status` = vitd_deficient
  ) %>%
  tbl_summary(
    by = `Low BMD`,
    statistic = list(
      all_continuous() ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = all_continuous() ~ 1,
    missing_text = "Missing"
  ) %>%
  add_overall() %>%
  add_p() %>%
  modify_header(
    label = "**Characteristic**",
    all_stat_cols() ~ "**{level}**  \nN = {n} ({p}%)"
  ) %>%
  modify_caption("**Table 1. Participant Characteristics by Low BMD Status**") %>%
  bold_labels()

## 612 missing rows in the "Low BMD" column have been removed.
## The following errors were returned during `modify_caption()`:
## ✖ For variable `Age (years)` (`Low BMD`) and "estimate", "std.error",
##   "parameter", "statistic", "conf.low", "conf.high", and "p.value" statistics:
##   namespace 'dplyr' 1.1.4 is already loaded, but >= 1.2.0 is required
## ✖ For variable `Age Group` (`Low BMD`) and "estimate", "std.error",
##   "parameter", "statistic", "conf.low", "conf.high", and "p.value" statistics:
##   namespace 'dplyr' 1.1.4 is already loaded, but >= 1.2.0 is required
## ✖ For variable `BMD (g/cm2)` (`Low BMD`) and "estimate", "std.error",
##   "parameter", "statistic", "conf.low", "conf.high", and "p.value" statistics:
##   namespace 'dplyr' 1.1.4 is already loaded, but >= 1.2.0 is required
## ✖ For variable `BMI (kg/m2)` (`Low BMD`) and "estimate", "std.error",
##   "parameter", "statistic", "conf.low", "conf.high", and "p.value" statistics:
##   namespace 'dplyr' 1.1.4 is already loaded, but >= 1.2.0 is required
## ✖ For variable `BMI Category` (`Low BMD`) and "estimate", "std.error",
##   "parameter", "statistic", "conf.low", "conf.high", and "p.value" statistics:
##   namespace 'dplyr' 1.1.4 is already loaded, but >= 1.2.0 is required
## ✖ For variable `Calcium` (`Low BMD`) and "estimate", "std.error", "parameter",
##   "statistic", "conf.low", "conf.high", and "p.value" statistics: namespace
##   'dplyr' 1.1.4 is already loaded, but >= 1.2.0 is required
## ✖ For variable `Gender` (`Low BMD`) and "estimate", "std.error", "parameter",
##   "statistic", "conf.low", "conf.high", and "p.value" statistics: namespace
##   'dplyr' 1.1.4 is already loaded, but >= 1.2.0 is required
## ✖ For variable `PA Level` (`Low BMD`) and "estimate", "std.error", "parameter",
##   "statistic", "conf.low", "conf.high", and "p.value" statistics: namespace
##   'dplyr' 1.1.4 is already loaded, but >= 1.2.0 is required
## ✖ For variable `Physical Activity (MET-min)` (`Low BMD`) and "estimate",
##   "std.error", "parameter", "statistic", "conf.low", "conf.high", and "p.value"
##   statistics: namespace 'dplyr' 1.1.4 is already loaded, but >= 1.2.0 is
##   required
## ✖ For variable `Race/Ethnicity` (`Low BMD`) and "estimate", "std.error",
##   "parameter", "statistic", "conf.low", "conf.high", and "p.value" statistics:
##   namespace 'dplyr' 1.1.4 is already loaded, but >= 1.2.0 is required
## ✖ For variable `Vitamin D` (`Low BMD`) and "estimate", "std.error",
##   "parameter", "statistic", "conf.low", "conf.high", and "p.value" statistics:
##   namespace 'dplyr' 1.1.4 is already loaded, but >= 1.2.0 is required
## ✖ For variable `Vitamin D Status` (`Low BMD`) and "estimate", "std.error",
##   "parameter", "statistic", "conf.low", "conf.high", and "p.value" statistics:
##   namespace 'dplyr' 1.1.4 is already loaded, but >= 1.2.0 is required

table1

**Table 1. Participant Characteristics by Low BMD Status**
Characteristic	Overall N = 2286 (1%)¹	Normal BMD N = 2243 (0.981189851268591%)¹	Low BMD N = 43 (0.0188101487314086%)¹
Age (years)	64.7 (9.0)	64.5 (9.0)	74.1 (7.5)
Age Group
50-59 years	718 (31%)	716 (32%)	2 (4.7%)
60-69 years	866 (38%)	858 (38%)	8 (19%)
70+ years	702 (31%)	669 (30%)	33 (77%)
Gender
Male	1,199 (52%)	1,196 (53%)	3 (7.0%)
Female	1,087 (48%)	1,047 (47%)	40 (93%)
Race/Ethnicity
Mexican American	255 (11%)	252 (11%)	3 (7.0%)
Other Hispanic	244 (11%)	240 (11%)	4 (9.3%)
Non-Hispanic White	846 (37%)	820 (37%)	26 (60%)
Non-Hispanic Black	532 (23%)	528 (24%)	4 (9.3%)
Other Race	409 (18%)	403 (18%)	6 (14%)
BMI (kg/m2)	29.0 (6.0)	29.1 (6.0)	22.7 (4.8)
Missing	11	11	0
BMI Category
Underweight	28 (1.2%)	22 (1.0%)	6 (14%)
Normal	539 (24%)	512 (23%)	27 (63%)
Overweight	858 (38%)	852 (38%)	6 (14%)
Obese	850 (37%)	846 (38%)	4 (9.3%)
Missing	11	11	0
Physical Activity (MET-min)	1,026.5 (1,250.0)	1,029.7 (1,254.5)	786.7 (828.5)
Missing	698	676	22
PA Level
Sedentary	810 (51%)	798 (51%)	12 (57%)
Moderately active	286 (18%)	283 (18%)	3 (14%)
Highly active	492 (31%)	486 (31%)	6 (29%)
Missing	698	676	22
BMD (g/cm2)	0.9 (0.2)	0.9 (0.2)	0.5 (0.1)
Calcium	848.1 (460.2)	849.9 (461.1)	752.0 (405.5)
Missing	157	153	4
Vitamin D	4.6 (5.1)	4.6 (5.1)	4.3 (3.5)
Missing	157	153	4
Vitamin D Status
Deficient	2,009 (94%)	1,971 (94%)	38 (97%)
Insufficient	82 (3.9%)	81 (3.9%)	1 (2.6%)
Sufficient	38 (1.8%)	38 (1.8%)	0 (0%)
Missing	157	153	4
¹ Mean (SD); n (%)

# Save table
if(!dir.exists("tables")) dir.create("tables")
table1 %>%
  as_gt() %>%
  gt::gtsave("tables/bmd_table1.html")

##STEP 7: Exploratory Data Analysis - Distributions

Distribution of BMD

p1 <- ggplot(bmd_analysis, aes(x = DXXOFBMD)) +
  geom_histogram(aes(y = after_stat(density)), 
                 binwidth = 0.05, 
                 fill = "steelblue", 
                 color = "black", 
                 alpha = 0.7) +
  geom_density(color = "darkred", linewidth = 1) +
  geom_vline(aes(xintercept = mean(DXXOFBMD, na.rm = TRUE)),
             color = "darkgreen", 
             linetype = "dashed", 
             linewidth = 1) +
  labs(
    title = "Figure 2. Distribution of Bone Mineral Density",
    x = "BMD (g/cm²)",
    y = "Density",
    caption = paste0("N = ", nrow(bmd_analysis))
  ) +
  theme_minimal()

print(p1)

## Warning: Removed 612 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 612 rows containing non-finite outside the scale range
## (`stat_density()`).

ggsave("figures/bmd_distribution.png", p1, width = 8, height = 5)

## Warning: Removed 612 rows containing non-finite outside the scale range (`stat_bin()`).
## Removed 612 rows containing non-finite outside the scale range
## (`stat_density()`).

# Distribution of Age
p2 <- ggplot(bmd_analysis, aes(x = RIDAGEYR)) +
  geom_histogram(binwidth = 2, fill = "steelblue", color = "black", alpha = 0.7) +
  labs(
    title = "Figure 3. Distribution of Age",
    x = "Age (years)",
    y = "Count"
  ) +
  theme_minimal()

print(p2)

ggsave("figures/age_distribution.png", p2, width = 8, height = 5)

# Distribution of BMI
p3 <- ggplot(bmd_analysis, aes(x = BMXBMI)) +
  geom_histogram(binwidth = 2, fill = "steelblue", color = "black", alpha = 0.7) +
  labs(
    title = "Figure 4. Distribution of BMI",
    x = "BMI (kg/m²)",
    y = "Count"
  ) +
  theme_minimal()

print(p3)

## Warning: Removed 64 rows containing non-finite outside the scale range
## (`stat_bin()`).

ggsave("figures/bmi_distribution.png", p3, width = 8, height = 5)

## Warning: Removed 64 rows containing non-finite outside the scale range
## (`stat_bin()`).

##STEP 8: Bivariate Associations

BMD by Age Group

p4 <- ggplot(bmd_analysis, aes(x = age_group, y = DXXOFBMD, fill = age_group)) +
  geom_boxplot(alpha = 0.7) +
  labs(
    title = "Figure 5. BMD by Age Group",
    x = "Age Group",
    y = "BMD (g/cm²)"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

print(p4)

## Warning: Removed 612 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

ggsave("figures/bmd_by_age.png", p4, width = 8, height = 6)

## Warning: Removed 612 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# BMD by Gender
p5 <- ggplot(bmd_analysis, aes(x = RIAGENDR, y = DXXOFBMD, fill = RIAGENDR)) +
  geom_boxplot(alpha = 0.7) +
  labs(
    title = "Figure 6. BMD by Gender",
    x = "Gender",
    y = "BMD (g/cm²)"
  ) +
  scale_fill_manual(values = c("Male" = "#2E8B57", "Female" = "#CD5C5C")) +
  theme_minimal() +
  theme(legend.position = "none")

print(p5)

## Warning: Removed 612 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

ggsave("figures/bmd_by_gender.png", p5, width = 8, height = 6)

## Warning: Removed 612 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# BMD by Physical Activity Level
p6 <- ggplot(bmd_analysis, aes(x = pa_level, y = DXXOFBMD, fill = pa_level)) +
  geom_boxplot(alpha = 0.7) +
  labs(
    title = "Figure 7. BMD by Physical Activity Level",
    x = "Physical Activity Level",
    y = "BMD (g/cm²)"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

print(p6)

## Warning: Removed 612 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

ggsave("figures/bmd_by_pa.png", p6, width = 8, height = 6)

## Warning: Removed 612 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# BMD by Vitamin D Status
p7 <- ggplot(bmd_analysis, aes(x = vitd_deficient, y = DXXOFBMD, fill = vitd_deficient)) +
  geom_boxplot(alpha = 0.7) +
  labs(
    title = "Figure 8. BMD by Vitamin D Status",
    x = "Vitamin D Status",
    y = "BMD (g/cm²)"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

print(p7)

## Warning: Removed 612 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

ggsave("figures/bmd_by_vitd.png", p7, width = 8, height = 6)

## Warning: Removed 612 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Part 1: Exploratory Visualization (15 points)

Research Question: Is there a linear association between dietary calcium intake (calcium, mg/day) and total femur bone mineral density (DXXOFBMD, g/cm²)?

# Create a scatterplot with a fitted regression line
ggplot(bmd_analysis, aes(x = calcium, y = DXXOFBMD)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", se = FALSE, color = "steelblue") +
  labs(
    title   = "Figure 1. Relationship Between Dietary Calcium and Bone Mineral Density",
    x       = "Calcium Intake (mg/day)",
    y       = "Total Femur BMD (g/cm²)"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 769 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 769 rows containing missing values or values outside the scale range
## (`geom_point()`).

Written interpretation (3–5 sentences):

The scatterplot shows a weak positive linear relationship between dietary calcium intake and bone mineral density. As calcium intake increases, BMD tends to increase slightly, but the points are widely scattered around the regression line, suggesting the association is not strong. There do not appear to be nonlinear patterns or extreme outliers that would violate linear regression assumptions. However, the wide dispersion indicates that calcium intake alone explains little of the variation in BMD.

##STEP 9: Correlation Analysis

library(corrplot)

# Select numeric variables for correlation
numeric_vars <- bmd_analysis %>%
  select(DXXOFBMD, RIDAGEYR, BMXBMI, totmet, calcium, vitd) %>%
  drop_na()

# Calculate correlation matrix
cor_matrix <- cor(numeric_vars)

# Display correlation matrix
cat("=== CORRELATION MATRIX ===\n")

## === CORRELATION MATRIX ===

print(round(cor_matrix, 3))

##          DXXOFBMD RIDAGEYR BMXBMI totmet calcium   vitd
## DXXOFBMD    1.000   -0.202  0.397  0.084   0.090  0.031
## RIDAGEYR   -0.202    1.000 -0.076 -0.099  -0.028  0.050
## BMXBMI      0.397   -0.076  1.000 -0.015  -0.027 -0.022
## totmet      0.084   -0.099 -0.015  1.000   0.084  0.010
## calcium     0.090   -0.028 -0.027  0.084   1.000  0.485
## vitd        0.031    0.050 -0.022  0.010   0.485  1.000

# Visualize correlations
png("figures/bmd_correlation_matrix.png", width = 8, height = 8, units = "in", res = 300)
corrplot(cor_matrix, 
         method = "color",
         type = "upper",
         order = "hclust",
         tl.col = "black",
         tl.srt = 45,
         addCoef.col = "black",
         number.cex = 0.8,
         title = "Figure 9. Correlation Matrix of Key Variables",
         mar = c(0, 0, 2, 0))
dev.off()

## png 
##   2

# Display the plot in R
corrplot(cor_matrix, 
         method = "color",
         type = "upper",
         order = "hclust",
         tl.col = "black",
         tl.srt = 45,
         addCoef.col = "black",
         number.cex = 0.8,
         title = "Figure 9. Correlation Matrix of Key Variables")

Part 2: Simple Linear Regression (40 points)

Step 1 — Fit the Model (5 points)

# Fit the simple linear regression model (calcium as predictor)
model_calcium <- lm(DXXOFBMD ~ calcium, data = bmd_analysis)

# Display the full model summary
summary(model_calcium)

## 
## Call:
## lm(formula = DXXOFBMD ~ calcium, data = bmd_analysis)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5565 -0.1057 -0.0056  0.1072  0.6262 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 8.99e-01   7.19e-03  125.04  < 2e-16 ***
## calcium     3.08e-05   7.45e-06    4.13  3.8e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.158 on 2127 degrees of freedom
##   (769 observations deleted due to missingness)
## Multiple R-squared:  0.00796,    Adjusted R-squared:  0.00749 
## F-statistic: 17.1 on 1 and 2127 DF,  p-value: 3.75e-05

Step 2 – Interpret the Coefficients (10 points)

A. Intercept (β₀):

The intercept (β₀ = 0.899 g/cm²) represents the predicted mean BMD for an individual with zero calcium intake. While mathematically defined, this value is not clinically meaningful in context because zero calcium intake is impossible (and would be incompatible with life). The intercept serves primarily as a mathematical anchor for the regression line rather than a clinically interpretable parameter. It provides the baseline from which the effect of calcium intake is measured.

B. Slope (β₁):

The slope (β₁ = 3.08 × 10⁻⁵ g/cm² per mg/day) indicates that for every 1 mg/day increase in dietary calcium, BMD increases by approximately 0.00003 g/cm². To make this more interpretable, a 500 mg/day increase (roughly two glasses of milk) predicts an increase of 0.015 g/cm² in BMD. This positive association aligns with biological plausibility, though the effect size is quite small given that typical calcium intakes range from 500-1500 mg/day. The p-value (3.75 × 10⁻⁵) indicates this association is statistically significant.

Step 3 — Statistical Inference (15 points)

# 95% confidence interval for the slope
confint(model_calcium)

##                2.5 %   97.5 %
## (Intercept) 8.85e-01 9.13e-01
## calcium     1.62e-05 4.54e-05

State your hypotheses:

H₀: β₁ = 0 (There is no linear association between calcium intake and BMD)
H₁: β₁ ≠ 0 (There is a linear association between calcium intake and BMD)

Report the test results:

The t-statistic for calcium is 4.131 with 2127 degrees of freedom, and p-value = 3.75 × 10⁻⁵. Since p < 0.05, we reject the null hypothesis. This provides strong statistical evidence of an association between calcium intake and BMD.

Interpret the 95% confidence interval for β₁:

We are 95% confident that the true population slope lies between 1.62 × 10⁻⁵ and 4.54 × 10⁻⁵ g/cm² per mg/day. This means that for each 1 mg/day increase in calcium, BMD increases by 0.000016 to 0.000045 g/cm² in the population. In more interpretable terms, a 500 mg/day increase in calcium (roughly two glasses of milk) is associated with a BMD increase between 0.008 and 0.023 g/cm². Since the entire interval is positive (does not include zero), this confirms the positive direction of the association and the statistical significance.

Step 4 — Model Fit: R² and Residual Standard Error (10 points)

R² (coefficient of determination):

The R² value of 0.008 indicates that only 0.8% of the variance in BMD is explained by calcium intake alone. This very low value suggests the model fits poorly and that many other factors (age, gender, physical activity, vitamin D) likely influence BMD.

Residual Standard Error (RSE):

The RSE of 0.158 g/cm² represents the typical distance between observed BMD values and the regression line. Given that mean BMD in the sample is 0.92 g/cm², this prediction error is substantial relative to the outcome scale.

Part 3: Prediction (20 points)

# Create a new data frame with the target predictor value
new_data <- data.frame(calcium = 1000)

# Fit the simple linear regression model (calcium as predictor)
model_calcium <- lm(DXXOFBMD ~ calcium, data = bmd_analysis)

# 95% confidence interval for the mean response at calcium = 1000
conf_int <- predict(model_calcium, newdata = new_data, interval = "confidence")

# 95% prediction interval for a new individual at calcium = 1000
pred_int <- predict(model_calcium, newdata = new_data, interval = "prediction")

# Display results
cat("Predicted BMD at 1000 mg/day calcium:\n")

## Predicted BMD at 1000 mg/day calcium:

print(conf_int)

##    fit   lwr   upr
## 1 0.93 0.923 0.937

print(pred_int)

##    fit  lwr  upr
## 1 0.93 0.62 1.24

Written interpretation (3–6 sentences):

Predicted BMD at 1,000 mg/day calcium: The model predicts a mean BMD of 0.930 g/cm² for individuals consuming 1,000 mg/day of calcium.
What does the 95% confidence interval represent? The 95% confidence interval (0.923 to 0.937 g/cm²) estimates the mean BMD for the entire population of people who consume 1,000 mg/day of calcium. We are 95% confident that the true population mean BMD falls within this narrow range. The narrow width reflects precise estimation of the population mean.
What is the 95% prediction interval, and why is it wider? The 95% prediction interval (0.620 to 1.240 g/cm²) predicts BMD for a single new individual consuming 1,000 mg/day of calcium. It is substantially wider than the confidence interval because it accounts for two sources of uncertainty: (1) uncertainty in estimating the population mean, and (2) individual variation around that mean (the residual standard error of 0.158 g/cm²). This reflects the fact that individuals vary considerably in their BMD even at the same calcium intake level.
Is 1,000 mg/day a meaningful value to predict? Yes, 1,000 mg/day is a clinically meaningful value to predict. It falls well within the observed calcium range in the data (approximately 0 to 4,000 mg/day) and represents the typical recommended daily intake for adults. Predicting at this value provides useful information about expected BMD at a common intake level.

##STEP 11: Model Diagnostics

# Residual plots for full model
par(mfrow = c(2, 2))
plot(model4)
par(mfrow = c(1, 1))

# Check for multicollinearity
library(car)
vif_values <- vif(model4)
cat("=== VARIANCE INFLATION FACTOR (VIF) ===\n")
print(vif_values)

##STEP 12: Interaction Analysis

# Check if age effect differs by gender
model_interaction <- lm(DXXOFBMD ~ RIDAGEYR * RIAGENDR + BMXBMI + totmet + vitd, 
                        data = bmd_analysis)
cat("=== INTERACTION MODEL: Age * Gender ===\n")
print(summary(model_interaction))

# Compare with main effects model
anova(model4, model_interaction)

# Visualize interaction
interaction_plot <- bmd_analysis %>%
  ggplot(aes(x = RIDAGEYR, y = DXXOFBMD, color = RIAGENDR)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", se = TRUE) +
  labs(
    title = "Figure 10. BMD by Age and Gender",
    x = "Age (years)",
    y = "BMD (g/cm²)",
    color = "Gender"
  ) +
  theme_minimal()

print(interaction_plot)
ggsave("figures/bmd_age_gender_interaction.png", interaction_plot, width = 8, height = 6)

##STEP 13: Save Cleaned Data and Workspace

# Save cleaned dataset
saveRDS(bmd_analysis, "bmd_cleaned.rds")
write.csv(bmd_analysis, "tables/bmd_cleaned.csv", row.names = FALSE)

# Save workspace
save(list = ls(), file = "bmd_analysis_workspace.RData")

# List all created files
cat("\n=== FILES CREATED ===\n")

## 
## === FILES CREATED ===

cat("\nFigures in 'figures' directory:\n")

## 
## Figures in 'figures' directory:

print(list.files("figures"))

## [1] "age_distribution.png"       "bmd_by_age.png"            
## [3] "bmd_by_gender.png"          "bmd_by_pa.png"             
## [5] "bmd_by_vitd.png"            "bmd_correlation_matrix.png"
## [7] "bmd_distribution.png"       "bmd_missing_data.png"      
## [9] "bmi_distribution.png"

cat("\nTables in 'tables' directory:\n")

## 
## Tables in 'tables' directory:

print(list.files("tables"))

## [1] "bmd_cleaned.csv" "bmd_table1.html"

cat("\n✅ BMD ANALYSIS COMPLETE!\n")

## 
## ✅ BMD ANALYSIS COMPLETE!

cat("Total participants:", nrow(bmd_analysis), "\n")

## Total participants: 2898

#STEP 14: Session Information (for reproducibility)

cat("=== SESSION INFORMATION ===\n")

## === SESSION INFORMATION ===

sessionInfo()

## R version 4.4.2 (2024-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] broom_1.0.12    corrplot_0.95   gtsummary_2.5.0 lubridate_1.9.3
##  [5] forcats_1.0.0   stringr_1.5.1   dplyr_1.1.4     purrr_1.0.2    
##  [9] readr_2.1.5     tidyr_1.3.1     tibble_3.2.1    ggplot2_4.0.2  
## [13] tidyverse_2.0.0 readxl_1.4.5   
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       xfun_0.56          bslib_0.10.0       lattice_0.22-6    
##  [5] tzdb_0.4.0         vctrs_0.6.5        tools_4.4.2        generics_0.1.4    
##  [9] pkgconfig_2.0.3    Matrix_1.7-1       RColorBrewer_1.1-3 S7_0.2.1          
## [13] gt_1.3.0           lifecycle_1.0.5    compiler_4.4.2     farver_2.1.2      
## [17] textshaping_0.4.0  litedown_0.9       htmltools_0.5.8.1  sass_0.4.10       
## [21] yaml_2.3.10        pillar_1.11.1      jquerylib_0.1.4    cachem_1.1.0      
## [25] nlme_3.1-166       commonmark_2.0.0   tidyselect_1.2.1   digest_0.6.37     
## [29] stringi_1.8.4      splines_4.4.2      labeling_0.4.3     fastmap_1.2.0     
## [33] grid_4.4.2         cli_3.6.3          magrittr_2.0.3     cards_0.7.1       
## [37] utf8_1.2.4         withr_3.0.2        scales_1.4.0       backports_1.5.0   
## [41] timechange_0.3.0   rmarkdown_2.30     otel_0.2.0         cellranger_1.1.0  
## [45] ragg_1.5.0         hms_1.1.4          evaluate_1.0.5     knitr_1.51        
## [49] mgcv_1.9-1         markdown_2.0       rlang_1.1.7        glue_1.8.0        
## [53] xml2_1.5.2         rstudioapi_0.18.0  jsonlite_2.0.0     R6_2.6.1          
## [57] fs_1.6.6           systemfonts_1.3.1

Part 4: Reflection (15 points)

A. Statistical Insight (6 points)

The regression model reveals a statistically significant but clinically weak positive association between dietary calcium and BMD. While the p-value (p < 0.001) indicates the association is unlikely due to chance, the effect size is small—a 500 mg/day increase (roughly two glasses of milk) predicts only about a 0.02 g/cm² increase in BMD. These results were not surprising given the multifactorial nature of bone health.

Key limitations for causal inference include: - Cross-sectional design: Cannot establish temporality (does calcium affect BMD, or do those with higher BMD consume more calcium?) - Residual confounding: Unmeasured factors like physical activity, sun exposure, genetics, and overall diet quality may influence both calcium intake and BMD - Measurement error: Dietary recall is imperfect

Potential confounders include age (bone density decreases with age, and diet may change), gender (women have lower BMD and different calcium needs), physical activity (stimulates bone formation and may correlate with healthier diets), and vitamin D status (essential for calcium absorption).

B. From ANOVA to Regression (5 points)

Homework 1 used one-way ANOVA to compare mean BMD across categorical ethnic groups, answering the question “Does BMD differ by ethnicity?” Regression with a continuous predictor answers “How does BMD change with each unit increase in calcium intake?”

Regression provides advantages ANOVA cannot: - Quantifies the dose-response relationship (slope) - Handles continuous predictors without categorizing (avoiding information loss) - Provides predictions for any predictor value - Extends easily to multiple predictors

ANOVA is preferred when the exposure is naturally categorical (ethnicity, treatment group), while regression is optimal for continuous exposures like nutrients, age, or BMI.

C. R Programming Growth (4 points)

The most challenging part was understanding how to extract and format specific values from model objects for inline reporting (e.g., 0). Working through the broom package functions like tidy() and glance() helped me understand how R stores model output. I now feel confident extracting coefficients, confidence intervals, and fit statistics for reporting in reproducible documents.

EPI 553 — Homework 2: Simple Linear Regression

Fizza Zaheer

March 12, 2026