#Part 0: Data Acquisition and Preparation ***


##Setup and Data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(haven)
library(here)
## here() starts at C:/Users/AA843241/OneDrive - University at Albany - SUNY/Desktop/EPI553/hw03
library(knitr)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(broom)
library(ggeffects)
library(gtsummary)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some

1 Loading Survey BRFSS 2023 Data

# Load raw HCW_database data
BRFSS_2023 <- read_xpt(
  "C:/Users/AA843241/OneDrive - University at Albany - SUNY/Desktop/EPI553/hw03/LLCP2023.XPT")

# Display dataset dimensions
dim(BRFSS_2023)
## [1] 433323    350

##Select the nine variables listed in the table above. Use select() with backtick notation for names beginning with underscores.

BRFSS_raw <- BRFSS_2023 %>%
  select(PHYSHLTH, MENTHLTH,`_BMI5`,SEXVAR,
         EXERANY2,`_AGEG5YR`,`_INCOMG1`,EDUCA,`_SMOKER3`)

head(BRFSS_raw, 10)
## # A tibble: 10 × 9
##    PHYSHLTH MENTHLTH `_BMI5` SEXVAR EXERANY2 `_AGEG5YR` `_INCOMG1` EDUCA
##       <dbl>    <dbl>   <dbl>  <dbl>    <dbl>      <dbl>      <dbl> <dbl>
##  1       88       88    3047      2        2         13          9     5
##  2       88       88    2856      2        1         13          9     5
##  3        6        2    2231      2        1         13          1     4
##  4        2       88    2744      2        1         12          9     5
##  5       88       88    2585      2        1         12          5     5
##  6        2        3    3018      2        1          9          5     5
##  7        8       77    2441      1        2         13          4     4
##  8        1       88    2727      2        2         12          9     5
##  9        5       88    3347      2        2         13          4     5
## 10       88       88    2296      1        1         12          5     4
## # ℹ 1 more variable: `_SMOKER3` <dbl>


###Select and Recode all variables. Store all categorical variables as labeled factors

brfss_clean <- BRFSS_raw %>%
  mutate(
    # PHYSHLTH:
    physhlth_days = case_when(
      PHYSHLTH == 88                    ~ 0,
      PHYSHLTH == 77                    ~ NA_real_,
      PHYSHLTH == 99                    ~ NA_real_,
      PHYSHLTH >= 1 & PHYSHLTH <= 30   ~ as.numeric(PHYSHLTH),
      TRUE                             ~ NA_real_),
    #MENTHLTH 
    menthlth_days = case_when(
      MENTHLTH == 88                    ~ 0,
      MENTHLTH == 77                    ~ NA_real_,
      MENTHLTH == 99                    ~ NA_real_,
      MENTHLTH >= 1 & MENTHLTH <= 30    ~ as.numeric(MENTHLTH),
      TRUE                             ~ NA_real_),
    
    #BMI
    bmi = case_when(
  `_BMI5` == 9999 ~ NA_real_, TRUE ~ `_BMI5` / 100),
    
     sex = factor(case_when(
      SEXVAR == 1 ~ "Male",
      SEXVAR == 2 ~ "Female",
      TRUE ~ NA_character_), 
      levels = c("Male", "Female")),

#EXERANY2: 1 = “Yes”, 2 = “No”; values 7 and 9 should be recoded to NA.  
    exercise = factor(case_when(
      EXERANY2 == 1 ~ "Yes",
      EXERANY2 == 2 ~ "No",
      EXERANY2 == 7 ~ NA_character_,
      EXERANY2 == 9 ~ NA_character_, 
      TRUE          ~ NA_character_ ), 
      levels = c("No", "Yes")),
    
# 1 = “18-24”, 2 = “25-29”, …, 13 = “80+”; value 14 should be recoded to NA.
    age_group = factor(case_when(
      `_AGEG5YR` == 1  ~ "18-24", 
      `_AGEG5YR` == 2  ~ "25-29", 
      `_AGEG5YR` == 3  ~ "30-34", 
      `_AGEG5YR` == 4  ~ "35-39", 
      `_AGEG5YR` == 5  ~ "40-44",
      `_AGEG5YR` == 6  ~ "45-49", 
      `_AGEG5YR` == 7  ~ "50-54",
      `_AGEG5YR` == 8  ~ "55-59",
      `_AGEG5YR` == 9  ~ "60-64", 
      `_AGEG5YR` == 10 ~ "65-69",
      `_AGEG5YR` == 11 ~ "70-74",
      `_AGEG5YR` == 12 ~ "75-79", 
      `_AGEG5YR` == 13 ~ "80+",
      `_AGEG5YR` == 14 ~ NA_character_,  TRUE ~NA_character_),
      levels = c("18-24", "25-29", "30-34", 
                 "35-39", "40-44", "45-49", 
                 "50-54", "55-59", "60-64", 
                 "65-69", "70-74", "75-79", "80+")),
    
  #_INCOMG1: 1 = “Less than $15,000” through 7 = “$200,000 or more”; value 9 should be recoded to NA.
      income = factor(case_when(
        `_INCOMG1`== 1 ~ "Less than $15,000",
        `_INCOMG1`== 2 ~ "$15,000 to < $25,000",       
        `_INCOMG1`== 3 ~ "$25,000 to < $35,000",
        `_INCOMG1`== 4 ~ "$35,000 to < $50,000",
        `_INCOMG1`== 5 ~ "$50,000 to < $100,000",
        `_INCOMG1`== 6 ~ "$100,000 to < $200,000", 
        `_INCOMG1`== 7 ~  "$200,000 or more",       
        `_INCOMG1`== 9 ~ NA_character_, TRUE ~NA_character_), 
        levels = c("Less than $15,000","$15,000 to < $25,000",
                  "$25,000 to < $35,000","$35,000 to < $50,000",
                  "$50,000 to < $100,000", "$100,000 to < $200,000",
                  "$200,000 or more")),

#EDUCA:  
    education = factor(case_when(
      EDUCA %in% c(1, 2)     ~ "Less than HS",
      EDUCA == 3             ~ "HS diploma or GED",
      EDUCA == 4             ~ "Some college or technical school",
      EDUCA == 5             ~ "College graduate",
      EDUCA == 6             ~ "Graduate or professional degree",
      EDUCA == 9             ~ NA_character_,
      TRUE                   ~ NA_character_), 
      levels = c("Less than HS", "HS diploma or GED", "Some college or technical school", 
                 "College graduate", "Graduate or professional degree")),
      
#_SMOKER3: 1 = “Current daily smoker”, 2 = “Current some-day smoker”, 3 = “Former smoker”, 4 = “Never smoker”; value 9 should be recoded NA
   
   smoking = factor(case_when(
     `_SMOKER3` == 1 ~ "Current daily smoker",
     `_SMOKER3` == 2 ~ "Current some-day smoker",
     `_SMOKER3` == 3 ~ "Former smoker",
     `_SMOKER3` == 4 ~ "Never smoker",
     `_SMOKER3` == 9 ~ NA_character_, TRUE ~ NA_character_),
     levels = c("Current daily smoker","Current some-day smoker","Former smoker", "Never smoker"))
      )
  
  brfss_clean %>%
  select(menthlth_days, physhlth_days, income, sex) %>%
  summarise(across(everything(), list(
    n_missing  = ~ sum(is.na(.)),
    pct_missing = ~ round(mean(is.na(.)) * 100, 1)
  ))) %>%
  pivot_longer(
    everything(),
    names_to      = c("Variable", ".value"),
    names_pattern = "(.+)_(n_missing|pct_missing)"
  ) %>%
  arrange(desc(pct_missing)) %>%
  kable(
    caption   = "Missing Observations BRFSS 2023",
    col.names = c("Variable", "N Missing", "% Missing")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Missing Observations BRFSS 2023
Variable N Missing % Missing
income 86623 20.0
physhlth_days 10785 2.5
menthlth_days 8108 1.9
sex 0 0.0


##Create the analytic dataset & Report the final analytic n.

brfss_clean |>
 filter(
    !is.na(physhlth_days),
    !is.na(menthlth_days),
    !is.na(bmi),
    !is.na(sex),
    !is.na(exercise),
    !is.na(age_group),
    !is.na(income),
    !is.na(education),
    !is.na(smoking) )
## # A tibble: 310,768 × 18
##    PHYSHLTH MENTHLTH `_BMI5` SEXVAR EXERANY2 `_AGEG5YR` `_INCOMG1` EDUCA
##       <dbl>    <dbl>   <dbl>  <dbl>    <dbl>      <dbl>      <dbl> <dbl>
##  1        6        2    2231      2        1         13          1     4
##  2       88       88    2585      2        1         12          5     5
##  3        2        3    3018      2        1          9          5     5
##  4        5       88    3347      2        2         13          4     5
##  5       88       88    2296      1        1         12          5     4
##  6       88       88    4184      1        1          8          5     5
##  7       88       88    2965      2        1         10          6     6
##  8       88       88    2375      1        2         12          6     5
##  9       88       88    2597      2        2         12          6     5
## 10        4       88    2819      2        1         13          2     4
## # ℹ 310,758 more rows
## # ℹ 10 more variables: `_SMOKER3` <dbl>, physhlth_days <dbl>,
## #   menthlth_days <dbl>, bmi <dbl>, sex <fct>, exercise <fct>, age_group <fct>,
## #   income <fct>, education <fct>, smoking <fct>
# Reproducible random sample
set.seed(1220)
brfss_analytic <- brfss_clean |>
  drop_na(menthlth_days, physhlth_days, bmi, sex, exercise,
 age_group, income, education, smoking) |>
 slice_sample(n = 8000)

# Save for lab activity
saveRDS(brfss_analytic,
  "C:/Users/AA843241/OneDrive - University at Albany - SUNY/Desktop/EPI553/HW03/brfss_clean.rds")

tibble(Metric = c("Observations", "Variables"),
       Value  = c(nrow(brfss_analytic), ncol(brfss_analytic))) |>
  kable(caption = "Analytic Dataset Dimensions") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Analytic Dataset Dimensions
Metric Value
Observations 8000
Variables 18


##Produce a descriptive statistics table using gtsummary::tbl_summary(), stratified by sex

brfss_analytic |>
  select(menthlth_days, physhlth_days, bmi, sex, exercise,
 age_group, income, education, smoking) |>
  tbl_summary(
    by=sex,
    label = list(
      menthlth_days ~ "Mentally unhealthy days (past 30)",
      physhlth_days ~ "Physically unhealthy days (past 30)",
      bmi           ~ "Body Mass Index",
      exercise      ~ "Any physical activity (past 30 days)",
      age_group     ~ "Age Group",
      income        ~ "Annual household income level",
      education     ~ "Education level",
      smoking    ~ "Smoking status"
    ),
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = all_continuous() ~ 1,
    missing = "no"
  ) |>
  add_n() |>
  bold_labels() |>
  italicize_levels() |>
  modify_caption("**Table 1. Descriptive Statistics — BRFSS 2020 (n = 5,000)**") |>
  as_flex_table()
**Table 1. Descriptive Statistics — BRFSS 2020 (n = 5,000)**

Characteristic

N

Male
N = 3,9361

Female
N = 4,0641

Mentally unhealthy days (past 30)

8,000

3.5 (7.5)

5.1 (8.6)

Physically unhealthy days (past 30)

8,000

4.0 (8.4)

4.9 (8.9)

Body Mass Index

8,000

28.7 (6.0)

28.7 (7.0)

Any physical activity (past 30 days)

8,000

3,146 (80%)

3,094 (76%)

Age Group

8,000

18-24

235 (6.0%)

171 (4.2%)

25-29

219 (5.6%)

189 (4.7%)

30-34

253 (6.4%)

210 (5.2%)

35-39

263 (6.7%)

302 (7.4%)

40-44

290 (7.4%)

292 (7.2%)

45-49

266 (6.8%)

252 (6.2%)

50-54

305 (7.7%)

303 (7.5%)

55-59

308 (7.8%)

352 (8.7%)

60-64

408 (10%)

379 (9.3%)

65-69

418 (11%)

483 (12%)

70-74

382 (9.7%)

426 (10%)

75-79

325 (8.3%)

338 (8.3%)

80+

264 (6.7%)

367 (9.0%)

Annual household income level

8,000

Less than $15,000

160 (4.1%)

247 (6.1%)

$15,000 to < $25,000

271 (6.9%)

370 (9.1%)

$25,000 to < $35,000

376 (9.6%)

495 (12%)

$35,000 to < $50,000

482 (12%)

585 (14%)

$50,000 to < $100,000

1,251 (32%)

1,260 (31%)

$100,000 to < $200,000

996 (25%)

869 (21%)

$200,000 or more

400 (10%)

238 (5.9%)

Education level

8,000

Less than HS

75 (1.9%)

49 (1.2%)

HS diploma or GED

130 (3.3%)

122 (3.0%)

Some college or technical school

950 (24%)

877 (22%)

College graduate

1,018 (26%)

1,120 (28%)

Graduate or professional degree

1,763 (45%)

1,896 (47%)

Smoking status

8,000

Current daily smoker

339 (8.6%)

319 (7.8%)

Current some-day smoker

151 (3.8%)

117 (2.9%)

Former smoker

1,207 (31%)

1,055 (26%)

Never smoker

2,239 (57%)

2,573 (63%)

1Mean (SD); n (%)



#Part 1: Multiple Linear Regression (25 Points)

Research question: What is the independent association of each predictor with the number of mentally unhealthy days in the past 30 days?


##Step 1:

Fit a multiple linear regression model with menthlth_days as the outcome and the following predictors: physhlth_days, bmi, sex, exercise, age_group, income, education, and smoking. Display the full summary() output.

model_mlr <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise +age_group +
                + income +education + smoking, data=brfss_analytic)
summary(model_mlr)
## 
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + exercise + 
##     age_group + +income + education + smoking, data = brfss_analytic)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.080  -3.865  -1.597   0.712  30.471 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                                9.65053    0.95407  10.115  < 2e-16
## physhlth_days                              0.26558    0.01007  26.384  < 2e-16
## bmi                                        0.03338    0.01321   2.527 0.011510
## sexFemale                                  1.39038    0.16750   8.301  < 2e-16
## exerciseYes                               -0.65116    0.21472  -3.033 0.002432
## age_group25-29                            -1.05660    0.51938  -2.034 0.041950
## age_group30-34                            -1.09395    0.50646  -2.160 0.030803
## age_group35-39                            -1.81103    0.48851  -3.707 0.000211
## age_group40-44                            -2.89307    0.48749  -5.935 3.07e-09
## age_group45-49                            -3.05618    0.49769  -6.141 8.61e-10
## age_group50-54                            -3.51674    0.48323  -7.277 3.72e-13
## age_group55-59                            -4.49597    0.47555  -9.454  < 2e-16
## age_group60-64                            -4.52215    0.45848  -9.863  < 2e-16
## age_group65-69                            -5.57795    0.45019 -12.390  < 2e-16
## age_group70-74                            -6.02536    0.45741 -13.173  < 2e-16
## age_group75-79                            -6.28656    0.47484 -13.239  < 2e-16
## age_group80+                              -6.81968    0.47684 -14.302  < 2e-16
## income$15,000 to < $25,000                -1.63703    0.46899  -3.491 0.000485
## income$25,000 to < $35,000                -2.07637    0.44797  -4.635 3.63e-06
## income$35,000 to < $50,000                -2.54685    0.43819  -5.812 6.40e-09
## income$50,000 to < $100,000               -3.05048    0.41069  -7.428 1.22e-13
## income$100,000 to < $200,000              -3.49984    0.42923  -8.154 4.07e-16
## income$200,000 or more                    -3.78409    0.50036  -7.563 4.38e-14
## educationHS diploma or GED                 0.07991    0.81066   0.099 0.921484
## educationSome college or technical school  1.07679    0.68973   1.561 0.118520
## educationCollege graduate                  1.79091    0.69119   2.591 0.009585
## educationGraduate or professional degree   1.73781    0.69250   2.509 0.012111
## smokingCurrent some-day smoker            -1.58670    0.53523  -2.965 0.003041
## smokingFormer smoker                      -1.98971    0.33713  -5.902 3.74e-09
## smokingNever smoker                       -2.93681    0.32162  -9.131  < 2e-16
##                                              
## (Intercept)                               ***
## physhlth_days                             ***
## bmi                                       *  
## sexFemale                                 ***
## exerciseYes                               ** 
## age_group25-29                            *  
## age_group30-34                            *  
## age_group35-39                            ***
## age_group40-44                            ***
## age_group45-49                            ***
## age_group50-54                            ***
## age_group55-59                            ***
## age_group60-64                            ***
## age_group65-69                            ***
## age_group70-74                            ***
## age_group75-79                            ***
## age_group80+                              ***
## income$15,000 to < $25,000                ***
## income$25,000 to < $35,000                ***
## income$35,000 to < $50,000                ***
## income$50,000 to < $100,000               ***
## income$100,000 to < $200,000              ***
## income$200,000 or more                    ***
## educationHS diploma or GED                   
## educationSome college or technical school    
## educationCollege graduate                 ** 
## educationGraduate or professional degree  *  
## smokingCurrent some-day smoker            ** 
## smokingFormer smoker                      ***
## smokingNever smoker                       ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.352 on 7970 degrees of freedom
## Multiple R-squared:  0.1853, Adjusted R-squared:  0.1824 
## F-statistic: 62.52 on 29 and 7970 DF,  p-value: < 2.2e-16


##Step 2: Write the fitted regression equation including all terms, with coefficients rounded to three decimal places.

\[\widehat{\text{menthlth\_days}} = 9.651 + 0.266\times\text{physhlth\_days} + 0.033\times\text{bmi} + 1.390\times\text{Female} - 0.651\times\text{Ex=Yes} - 1.057\times\text{age25-29} - 1.094\times\text{age30-34} - 1.811\times\text{age35-39} - 2.893\times\text{age40-44} - 3.056\times\text{age45-49} - 3.517\times\text{age50-54} - 4.496\times\text{age55-59} - 4.522\times\text{age60-64} - 5.578\times\text{age65-69} - 6.025\times\text{age70-74} - 6.287\times\text{age75-79} - 6.820\times\text{age80+} - 1.637\times\text{inc2} - 2.076\times\text{inc3} - 2.547\times\text{inc4} - 3.050\times\text{inc5} - 3.500\times\text{inc6} - 3.784\times\text{inc7} + 0.080\times\text{HS} + 1.077\times\text{SomeCol} + 1.791\times\text{CollGrad} + 1.738\times\text{GradProf} - 1.587\times\text{SmkSome} - 1.990\times\text{SmkForm} - 2.937\times\text{SmkNev}\]

Reference categories: Male, No exercise, Age 18–24, Income < $15,000, Less than HS, Current daily smoker.



##Step 3: Interpret the following coefficients in plain language. For each, state the direction, magnitude, and meaning in terms of mentally unhealthy days, holding all other variables constant: physhlth_days (continuous predictor) >For every additional +1 day of Days of poor physical health the days of poor mental health increases by 0.266 on average among participants, holding all other variables constant.

bmi (continuous predictor) > For every additional +1 increase of BMI unit \((kg/m^2)\) the Days of poor mental health increases by 0.033 on average among participants, holding all other variables constant.

sex: Female vs. Male (reference) >In comparison with Male participants, Female participants have 1.390 more days of poor mental health on average, holding all other variables in constant.

• exercise: Yes vs. No (reference) • One age_group coefficient of your choice • Two income coefficients of your choice, compared to the reference category (Less than $15,000) Step 4: Report and interpret each of the following model-level statistics: • R-squared: proportion of variance in mentally unhealthy days explained by all predictors • Adjusted R-squared: how it differs from R-squared and what it adds • Root MSE (residual standard error): average prediction error of the model • Overall F-test: state H0, F-statistic, degrees of freedom (numerator and denominator), p-value, and conclusion