#Part 0: Data Acquisition and Preparation ***
##Setup and Data
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(haven)
library(here)
## here() starts at C:/Users/AA843241/OneDrive - University at Albany - SUNY/Desktop/EPI553/hw03
library(knitr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(broom)
library(ggeffects)
library(gtsummary)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
# Load raw HCW_database data
BRFSS_2023 <- read_xpt(
"C:/Users/AA843241/OneDrive - University at Albany - SUNY/Desktop/EPI553/hw03/LLCP2023.XPT")
# Display dataset dimensions
dim(BRFSS_2023)
## [1] 433323 350
##Select the nine variables listed in the table above. Use select() with backtick notation for names beginning with underscores.
BRFSS_raw <- BRFSS_2023 %>%
select(PHYSHLTH, MENTHLTH,`_BMI5`,SEXVAR,
EXERANY2,`_AGEG5YR`,`_INCOMG1`,EDUCA,`_SMOKER3`)
head(BRFSS_raw, 10)
## # A tibble: 10 × 9
## PHYSHLTH MENTHLTH `_BMI5` SEXVAR EXERANY2 `_AGEG5YR` `_INCOMG1` EDUCA
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 88 88 3047 2 2 13 9 5
## 2 88 88 2856 2 1 13 9 5
## 3 6 2 2231 2 1 13 1 4
## 4 2 88 2744 2 1 12 9 5
## 5 88 88 2585 2 1 12 5 5
## 6 2 3 3018 2 1 9 5 5
## 7 8 77 2441 1 2 13 4 4
## 8 1 88 2727 2 2 12 9 5
## 9 5 88 3347 2 2 13 4 5
## 10 88 88 2296 1 1 12 5 4
## # ℹ 1 more variable: `_SMOKER3` <dbl>
###Select and Recode all variables. Store all categorical variables as labeled factors
brfss_clean <- BRFSS_raw %>%
mutate(
# PHYSHLTH:
physhlth_days = case_when(
PHYSHLTH == 88 ~ 0,
PHYSHLTH == 77 ~ NA_real_,
PHYSHLTH == 99 ~ NA_real_,
PHYSHLTH >= 1 & PHYSHLTH <= 30 ~ as.numeric(PHYSHLTH),
TRUE ~ NA_real_),
#MENTHLTH
menthlth_days = case_when(
MENTHLTH == 88 ~ 0,
MENTHLTH == 77 ~ NA_real_,
MENTHLTH == 99 ~ NA_real_,
MENTHLTH >= 1 & MENTHLTH <= 30 ~ as.numeric(MENTHLTH),
TRUE ~ NA_real_),
#BMI
bmi = case_when(
`_BMI5` == 9999 ~ NA_real_, TRUE ~ `_BMI5` / 100),
sex = factor(case_when(
SEXVAR == 1 ~ "Male",
SEXVAR == 2 ~ "Female",
TRUE ~ NA_character_),
levels = c("Male", "Female")),
#EXERANY2: 1 = “Yes”, 2 = “No”; values 7 and 9 should be recoded to NA.
exercise = factor(case_when(
EXERANY2 == 1 ~ "Yes",
EXERANY2 == 2 ~ "No",
EXERANY2 == 7 ~ NA_character_,
EXERANY2 == 9 ~ NA_character_,
TRUE ~ NA_character_ ),
levels = c("No", "Yes")),
# 1 = “18-24”, 2 = “25-29”, …, 13 = “80+”; value 14 should be recoded to NA.
age_group = factor(case_when(
`_AGEG5YR` == 1 ~ "18-24",
`_AGEG5YR` == 2 ~ "25-29",
`_AGEG5YR` == 3 ~ "30-34",
`_AGEG5YR` == 4 ~ "35-39",
`_AGEG5YR` == 5 ~ "40-44",
`_AGEG5YR` == 6 ~ "45-49",
`_AGEG5YR` == 7 ~ "50-54",
`_AGEG5YR` == 8 ~ "55-59",
`_AGEG5YR` == 9 ~ "60-64",
`_AGEG5YR` == 10 ~ "65-69",
`_AGEG5YR` == 11 ~ "70-74",
`_AGEG5YR` == 12 ~ "75-79",
`_AGEG5YR` == 13 ~ "80+",
`_AGEG5YR` == 14 ~ NA_character_, TRUE ~NA_character_),
levels = c("18-24", "25-29", "30-34",
"35-39", "40-44", "45-49",
"50-54", "55-59", "60-64",
"65-69", "70-74", "75-79", "80+")),
#_INCOMG1: 1 = “Less than $15,000” through 7 = “$200,000 or more”; value 9 should be recoded to NA.
income = factor(case_when(
`_INCOMG1`== 1 ~ "Less than $15,000",
`_INCOMG1`== 2 ~ "$15,000 to < $25,000",
`_INCOMG1`== 3 ~ "$25,000 to < $35,000",
`_INCOMG1`== 4 ~ "$35,000 to < $50,000",
`_INCOMG1`== 5 ~ "$50,000 to < $100,000",
`_INCOMG1`== 6 ~ "$100,000 to < $200,000",
`_INCOMG1`== 7 ~ "$200,000 or more",
`_INCOMG1`== 9 ~ NA_character_, TRUE ~NA_character_),
levels = c("Less than $15,000","$15,000 to < $25,000",
"$25,000 to < $35,000","$35,000 to < $50,000",
"$50,000 to < $100,000", "$100,000 to < $200,000",
"$200,000 or more")),
#EDUCA:
education = factor(case_when(
EDUCA %in% c(1, 2) ~ "Less than HS",
EDUCA == 3 ~ "HS diploma or GED",
EDUCA == 4 ~ "Some college or technical school",
EDUCA == 5 ~ "College graduate",
EDUCA == 6 ~ "Graduate or professional degree",
EDUCA == 9 ~ NA_character_,
TRUE ~ NA_character_),
levels = c("Less than HS", "HS diploma or GED", "Some college or technical school",
"College graduate", "Graduate or professional degree")),
#_SMOKER3: 1 = “Current daily smoker”, 2 = “Current some-day smoker”, 3 = “Former smoker”, 4 = “Never smoker”; value 9 should be recoded NA
smoking = factor(case_when(
`_SMOKER3` == 1 ~ "Current daily smoker",
`_SMOKER3` == 2 ~ "Current some-day smoker",
`_SMOKER3` == 3 ~ "Former smoker",
`_SMOKER3` == 4 ~ "Never smoker",
`_SMOKER3` == 9 ~ NA_character_, TRUE ~ NA_character_),
levels = c("Current daily smoker","Current some-day smoker","Former smoker", "Never smoker"))
)
brfss_clean %>%
select(menthlth_days, physhlth_days, income, sex) %>%
summarise(across(everything(), list(
n_missing = ~ sum(is.na(.)),
pct_missing = ~ round(mean(is.na(.)) * 100, 1)
))) %>%
pivot_longer(
everything(),
names_to = c("Variable", ".value"),
names_pattern = "(.+)_(n_missing|pct_missing)"
) %>%
arrange(desc(pct_missing)) %>%
kable(
caption = "Missing Observations BRFSS 2023",
col.names = c("Variable", "N Missing", "% Missing")
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Variable | N Missing | % Missing |
|---|---|---|
| income | 86623 | 20.0 |
| physhlth_days | 10785 | 2.5 |
| menthlth_days | 8108 | 1.9 |
| sex | 0 | 0.0 |
##Create the analytic dataset & Report the final analytic n.
brfss_clean |>
filter(
!is.na(physhlth_days),
!is.na(menthlth_days),
!is.na(bmi),
!is.na(sex),
!is.na(exercise),
!is.na(age_group),
!is.na(income),
!is.na(education),
!is.na(smoking) )
## # A tibble: 310,768 × 18
## PHYSHLTH MENTHLTH `_BMI5` SEXVAR EXERANY2 `_AGEG5YR` `_INCOMG1` EDUCA
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6 2 2231 2 1 13 1 4
## 2 88 88 2585 2 1 12 5 5
## 3 2 3 3018 2 1 9 5 5
## 4 5 88 3347 2 2 13 4 5
## 5 88 88 2296 1 1 12 5 4
## 6 88 88 4184 1 1 8 5 5
## 7 88 88 2965 2 1 10 6 6
## 8 88 88 2375 1 2 12 6 5
## 9 88 88 2597 2 2 12 6 5
## 10 4 88 2819 2 1 13 2 4
## # ℹ 310,758 more rows
## # ℹ 10 more variables: `_SMOKER3` <dbl>, physhlth_days <dbl>,
## # menthlth_days <dbl>, bmi <dbl>, sex <fct>, exercise <fct>, age_group <fct>,
## # income <fct>, education <fct>, smoking <fct>
# Reproducible random sample
set.seed(1220)
brfss_analytic <- brfss_clean |>
drop_na(menthlth_days, physhlth_days, bmi, sex, exercise,
age_group, income, education, smoking) |>
slice_sample(n = 8000)
# Save for lab activity
saveRDS(brfss_analytic,
"C:/Users/AA843241/OneDrive - University at Albany - SUNY/Desktop/EPI553/HW03/brfss_clean.rds")
tibble(Metric = c("Observations", "Variables"),
Value = c(nrow(brfss_analytic), ncol(brfss_analytic))) |>
kable(caption = "Analytic Dataset Dimensions") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)
| Metric | Value |
|---|---|
| Observations | 8000 |
| Variables | 18 |
##Produce a descriptive statistics table using gtsummary::tbl_summary(), stratified by sex
brfss_analytic |>
select(menthlth_days, physhlth_days, bmi, sex, exercise,
age_group, income, education, smoking) |>
tbl_summary(
by=sex,
label = list(
menthlth_days ~ "Mentally unhealthy days (past 30)",
physhlth_days ~ "Physically unhealthy days (past 30)",
bmi ~ "Body Mass Index",
exercise ~ "Any physical activity (past 30 days)",
age_group ~ "Age Group",
income ~ "Annual household income level",
education ~ "Education level",
smoking ~ "Smoking status"
),
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
digits = all_continuous() ~ 1,
missing = "no"
) |>
add_n() |>
bold_labels() |>
italicize_levels() |>
modify_caption("**Table 1. Descriptive Statistics — BRFSS 2020 (n = 5,000)**") |>
as_flex_table()
Characteristic | N | Male | Female |
|---|---|---|---|
Mentally unhealthy days (past 30) | 8,000 | 3.5 (7.5) | 5.1 (8.6) |
Physically unhealthy days (past 30) | 8,000 | 4.0 (8.4) | 4.9 (8.9) |
Body Mass Index | 8,000 | 28.7 (6.0) | 28.7 (7.0) |
Any physical activity (past 30 days) | 8,000 | 3,146 (80%) | 3,094 (76%) |
Age Group | 8,000 | ||
18-24 | 235 (6.0%) | 171 (4.2%) | |
25-29 | 219 (5.6%) | 189 (4.7%) | |
30-34 | 253 (6.4%) | 210 (5.2%) | |
35-39 | 263 (6.7%) | 302 (7.4%) | |
40-44 | 290 (7.4%) | 292 (7.2%) | |
45-49 | 266 (6.8%) | 252 (6.2%) | |
50-54 | 305 (7.7%) | 303 (7.5%) | |
55-59 | 308 (7.8%) | 352 (8.7%) | |
60-64 | 408 (10%) | 379 (9.3%) | |
65-69 | 418 (11%) | 483 (12%) | |
70-74 | 382 (9.7%) | 426 (10%) | |
75-79 | 325 (8.3%) | 338 (8.3%) | |
80+ | 264 (6.7%) | 367 (9.0%) | |
Annual household income level | 8,000 | ||
Less than $15,000 | 160 (4.1%) | 247 (6.1%) | |
$15,000 to < $25,000 | 271 (6.9%) | 370 (9.1%) | |
$25,000 to < $35,000 | 376 (9.6%) | 495 (12%) | |
$35,000 to < $50,000 | 482 (12%) | 585 (14%) | |
$50,000 to < $100,000 | 1,251 (32%) | 1,260 (31%) | |
$100,000 to < $200,000 | 996 (25%) | 869 (21%) | |
$200,000 or more | 400 (10%) | 238 (5.9%) | |
Education level | 8,000 | ||
Less than HS | 75 (1.9%) | 49 (1.2%) | |
HS diploma or GED | 130 (3.3%) | 122 (3.0%) | |
Some college or technical school | 950 (24%) | 877 (22%) | |
College graduate | 1,018 (26%) | 1,120 (28%) | |
Graduate or professional degree | 1,763 (45%) | 1,896 (47%) | |
Smoking status | 8,000 | ||
Current daily smoker | 339 (8.6%) | 319 (7.8%) | |
Current some-day smoker | 151 (3.8%) | 117 (2.9%) | |
Former smoker | 1,207 (31%) | 1,055 (26%) | |
Never smoker | 2,239 (57%) | 2,573 (63%) | |
1Mean (SD); n (%) | |||
#Part 1: Multiple Linear Regression (25 Points)
Research question: What is the independent association of each predictor with the number of mentally unhealthy days in the past 30 days?
##Step 1:
Fit a multiple linear regression model with menthlth_days as the outcome and the following predictors: physhlth_days, bmi, sex, exercise, age_group, income, education, and smoking. Display the full summary() output.
model_mlr <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise +age_group +
+ income +education + smoking, data=brfss_analytic)
summary(model_mlr)
##
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + exercise +
## age_group + +income + education + smoking, data = brfss_analytic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.080 -3.865 -1.597 0.712 30.471
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.65053 0.95407 10.115 < 2e-16
## physhlth_days 0.26558 0.01007 26.384 < 2e-16
## bmi 0.03338 0.01321 2.527 0.011510
## sexFemale 1.39038 0.16750 8.301 < 2e-16
## exerciseYes -0.65116 0.21472 -3.033 0.002432
## age_group25-29 -1.05660 0.51938 -2.034 0.041950
## age_group30-34 -1.09395 0.50646 -2.160 0.030803
## age_group35-39 -1.81103 0.48851 -3.707 0.000211
## age_group40-44 -2.89307 0.48749 -5.935 3.07e-09
## age_group45-49 -3.05618 0.49769 -6.141 8.61e-10
## age_group50-54 -3.51674 0.48323 -7.277 3.72e-13
## age_group55-59 -4.49597 0.47555 -9.454 < 2e-16
## age_group60-64 -4.52215 0.45848 -9.863 < 2e-16
## age_group65-69 -5.57795 0.45019 -12.390 < 2e-16
## age_group70-74 -6.02536 0.45741 -13.173 < 2e-16
## age_group75-79 -6.28656 0.47484 -13.239 < 2e-16
## age_group80+ -6.81968 0.47684 -14.302 < 2e-16
## income$15,000 to < $25,000 -1.63703 0.46899 -3.491 0.000485
## income$25,000 to < $35,000 -2.07637 0.44797 -4.635 3.63e-06
## income$35,000 to < $50,000 -2.54685 0.43819 -5.812 6.40e-09
## income$50,000 to < $100,000 -3.05048 0.41069 -7.428 1.22e-13
## income$100,000 to < $200,000 -3.49984 0.42923 -8.154 4.07e-16
## income$200,000 or more -3.78409 0.50036 -7.563 4.38e-14
## educationHS diploma or GED 0.07991 0.81066 0.099 0.921484
## educationSome college or technical school 1.07679 0.68973 1.561 0.118520
## educationCollege graduate 1.79091 0.69119 2.591 0.009585
## educationGraduate or professional degree 1.73781 0.69250 2.509 0.012111
## smokingCurrent some-day smoker -1.58670 0.53523 -2.965 0.003041
## smokingFormer smoker -1.98971 0.33713 -5.902 3.74e-09
## smokingNever smoker -2.93681 0.32162 -9.131 < 2e-16
##
## (Intercept) ***
## physhlth_days ***
## bmi *
## sexFemale ***
## exerciseYes **
## age_group25-29 *
## age_group30-34 *
## age_group35-39 ***
## age_group40-44 ***
## age_group45-49 ***
## age_group50-54 ***
## age_group55-59 ***
## age_group60-64 ***
## age_group65-69 ***
## age_group70-74 ***
## age_group75-79 ***
## age_group80+ ***
## income$15,000 to < $25,000 ***
## income$25,000 to < $35,000 ***
## income$35,000 to < $50,000 ***
## income$50,000 to < $100,000 ***
## income$100,000 to < $200,000 ***
## income$200,000 or more ***
## educationHS diploma or GED
## educationSome college or technical school
## educationCollege graduate **
## educationGraduate or professional degree *
## smokingCurrent some-day smoker **
## smokingFormer smoker ***
## smokingNever smoker ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.352 on 7970 degrees of freedom
## Multiple R-squared: 0.1853, Adjusted R-squared: 0.1824
## F-statistic: 62.52 on 29 and 7970 DF, p-value: < 2.2e-16
##Step 2: Write the fitted regression equation including all terms, with coefficients rounded to three decimal places.
\[\widehat{\text{menthlth\_days}} = 9.651 + 0.266\times\text{physhlth\_days} + 0.033\times\text{bmi} + 1.390\times\text{Female} - 0.651\times\text{Ex=Yes} - 1.057\times\text{age25-29} - 1.094\times\text{age30-34} - 1.811\times\text{age35-39} - 2.893\times\text{age40-44} - 3.056\times\text{age45-49} - 3.517\times\text{age50-54} - 4.496\times\text{age55-59} - 4.522\times\text{age60-64} - 5.578\times\text{age65-69} - 6.025\times\text{age70-74} - 6.287\times\text{age75-79} - 6.820\times\text{age80+} - 1.637\times\text{inc2} - 2.076\times\text{inc3} - 2.547\times\text{inc4} - 3.050\times\text{inc5} - 3.500\times\text{inc6} - 3.784\times\text{inc7} + 0.080\times\text{HS} + 1.077\times\text{SomeCol} + 1.791\times\text{CollGrad} + 1.738\times\text{GradProf} - 1.587\times\text{SmkSome} - 1.990\times\text{SmkForm} - 2.937\times\text{SmkNev}\]
Reference categories: Male, No exercise, Age 18–24, Income < $15,000, Less than HS, Current daily smoker.
##Step 3: Interpret the following coefficients in plain language. For each, state the direction, magnitude, and meaning in terms of mentally unhealthy days, holding all other variables constant: physhlth_days (continuous predictor) >For every additional +1 day of Days of poor physical health the days of poor mental health increases by 0.266 on average among participants, holding all other variables constant.
bmi (continuous predictor) > For every additional +1 increase of BMI unit \((kg/m^2)\) the Days of poor mental health increases by 0.033 on average among participants, holding all other variables constant.
sex: Female vs. Male (reference) >In comparison with Male participants, Female participants have 1.390 more days of poor mental health on average, holding all other variables in constant.
• exercise: Yes vs. No (reference) • One age_group coefficient of your choice • Two income coefficients of your choice, compared to the reference category (Less than $15,000) Step 4: Report and interpret each of the following model-level statistics: • R-squared: proportion of variance in mentally unhealthy days explained by all predictors • Adjusted R-squared: how it differs from R-squared and what it adds • Root MSE (residual standard error): average prediction error of the model • Overall F-test: state H0, F-statistic, degrees of freedom (numerator and denominator), p-value, and conclusion