knitr::opts_chunk$set(echo = TRUE)

Part 0 Load required packages: tidyverse, haven, broom, kableExtra, car, gtsummary, ggeffects.

# Required Packgaes
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(haven)
library(broom)
library(kableExtra)

## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

library(car)

## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some

library(gtsummary)
library(ggeffects)

# Extra packages
library(janitor)

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(dplyr)
library(tibble)
library(knitr)

options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))

## Setting theme "Compact"
## Setting theme "Compact"

Find LLCP2023.XPT location using file.choose()

file.choose()

## [1] "\001"

Import LLCP2023.XPT using read_xpt()

brfss_raw_ne <- read_xpt("/Users/morganwheat/Desktop/R Project/LLCP2023.XPT ") |>
  clean_names()

tibble(Metric = c("Observations", "Variables"),
       Value  = c(nrow(brfss_raw_ne), ncol(brfss_raw_ne))) |>
  kable(caption = "Raw Dataset Dimensions") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Raw Dataset Dimensions
Metric	Value
Observations	433323
Variables	350

Report the total number of rows and columns in the raw dataset.

There are 433323 observations (rows) and 350 variables (columns) in the raw dataset.

Select the nine variables listed in the table above. Use select()

# Select the nine variables
brfss_select_ne <- brfss_raw_ne |>
  select(menthlth, physhlth, bmi5, sexvar, exerany2, ageg5yr, incomg1, educa, smoker3)

# Display the results
tibble(Metric = c("Observations", "Variables"),
       Value  = c(nrow(brfss_select_ne), ncol(brfss_select_ne))) |>
  kable(caption = "Selected Dataset Dimensions") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Selected Dataset Dimensions
Metric	Value
Observations	433323
Variables	9

Recode all variables as described above. Store all categorical variables as labeled factors.

brfss_recode_ne <- brfss_select_ne |>
  mutate(
    # Mentally unhealthy days (past 30)
     menthlth_days = case_when(
      menthlth == 88              ~ 0,
      menthlth >= 1 & menthlth <= 30 ~ as.numeric(menthlth),
      TRUE                        ~ NA_real_
    ),
    # Physically unhealthy days (past 30)
     physhlth_days = case_when(
      physhlth == 88              ~ 0,
      physhlth >= 1 & physhlth <= 30 ~ as.numeric(physhlth),
      TRUE                        ~ NA_real_
    ),
    # Body Mass Index x 100 (divide by 100)
     bmi = case_when( bmi5 == 9999 ~ NA_real_, 
     TRUE ~bmi5/100),
    # Sex (1 = Male, 2 = Female)
     sex = factor(sexvar, levels = c(1, 2), labels = c("Male", "Female")),
    # Any Exercise in the past 30 days (1 = Yes, 2 = No)
    exercise = factor(case_when(
      exerany2 == 1 ~ "Yes",
      exerany2 == 2 ~ "No",
      TRUE          ~ NA_character_
    ), levels = c("No", "Yes")),
    # Age Group In 5-Year Categories (13 levels)
   age_group = factor(case_when(
      ageg5yr == 1 ~ "18-24",
      ageg5yr == 2 ~ "25-29",
      ageg5yr == 3 ~ "30-34",
      ageg5yr == 4 ~ "35-39",
      ageg5yr == 5 ~ "40-44",
      ageg5yr == 6 ~ "45-49",
      ageg5yr == 7 ~ "50-54",
      ageg5yr == 8 ~ "55-59",
      ageg5yr == 9 ~ "60-64",
      ageg5yr == 10 ~ "65-69",
      ageg5yr == 11 ~ "70-74",
      ageg5yr == 12 ~ "75-79",
      ageg5yr == 13 ~ "80+",
      ageg5yr == 14 ~ NA_character_,
      TRUE ~ NA_character_
    ), levels = c("18-24", "25-29", "30-34", "35-39", "40-44", "45-49", "50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80+")),
  # Annual Household Income (7 levels)
  income = factor(case_when(
    incomg1 == 1 ~ "Less than $15,000",
    incomg1 == 2 ~ "$15,000 to $25,000",
    incomg1 == 3 ~ "$25,000 to $35,000",
    incomg1 == 4 ~ "$35,000 to $50,000",
    incomg1 == 5 ~ "$50,000 to $100,000",
    incomg1 == 6 ~ "$100,000 to $200,000",
    incomg1 == 7 ~ "$200,000 or more",
    incomg1 == 9 ~ NA_character_,
  TRUE ~ NA_character_
),
levels = c("Less than $15,000", "$15,000 to $25,000", "$25,000 to $35,000",
           "$35,000 to $50,000", "$50,000 to $100,000",
           "$100,000 to $200,000", "$200,000 or more")),
  # Highest Level of Education Completed
    education = factor(case_when(
      educa %in% c(1,2) ~ "Less than high school",
      educa == 3 ~ "High school diploma or GED",
      educa == 4 ~ "Some college or technical school",
      educa == 5 ~ "College graduate",
      educa == 6 ~ "Graduate or professional degree",
      educa == 9 ~ NA_character_,
      TRUE ~ NA_character_
    ), levels = c("Less than high school", "High school diploma or GED", "Some college or technical school", "College graduate", "Graduate or professional degree")),
  # Smoking Status (4 levels)
  smoking = factor(case_when(
      smoker3 == 1 ~ "Current daily smoker",
      smoker3 == 2 ~ "Current some-day smoker",
      smoker3 == 3 ~ "Former smoker",
      smoker3 == 4 ~ "Never smoker",
      smoker3 == 9 ~ NA_character_,
      TRUE ~ NA_character_
    ), levels = c("Current daily smoker", "Current some-day smoker", "Former smoker", "Never smoker"))
)

# Report the number and percentage of missing observations for menthlth_days and at least two other key variables, before removing any missing values.
missing_summary_ne <- data.frame(
  Variable = names(brfss_recode_ne),
  missing_count = colSums(is.na(brfss_recode_ne)),
  missing_Percent = round(colSums(is.na(brfss_recode_ne)) / nrow(brfss_recode_ne) * 100, 2)
)

missing_summary_sorted <- missing_summary_ne[order(missing_summary_ne$missing_count), ]
print(missing_summary_sorted)

##                    Variable missing_count missing_Percent
## sexvar               sexvar             0            0.00
## ageg5yr             ageg5yr             0            0.00
## incomg1             incomg1             0            0.00
## smoker3             smoker3             0            0.00
## sex                     sex             0            0.00
## exerany2           exerany2             2            0.00
## menthlth           menthlth             3            0.00
## physhlth           physhlth             3            0.00
## educa                 educa             9            0.00
## exercise           exercise          1251            0.29
## education         education          2325            0.54
## age_group         age_group          7779            1.80
## menthlth_days menthlth_days          8108            1.87
## physhlth_days physhlth_days         10785            2.49
## smoking             smoking         23062            5.32
## bmi5                   bmi5         40535            9.35
## bmi                     bmi         40535            9.35
## income               income         86623           19.99

# Create the analytic dataset using drop_na() and slice_sample() with set.seed(1220)
set.seed(1220)
brfss_analytic_ne <- brfss_recode_ne |>
  select(menthlth_days, physhlth_days, bmi, sex, exercise,
         age_group, income, education, smoking) |>
  drop_na() |>
  slice_sample(n = 8000)

tibble(Metric = c("Observations", "Variables"),
       Value  = c(nrow(brfss_analytic_ne), ncol(brfss_analytic_ne))) |>
  kable(caption = "Analytic Dataset Dimensions") |>
  kable_styling()

Analytic Dataset Dimensions
Metric	Value
Observations	8000
Variables	9

Report the final analytic n.

In the final analytic dataset, n = 8,000.

Produce a descriptive statistics table using gtsummary::tbl_summary(), stratified by sex.

brfss_analytic_ne |>
  select(menthlth_days, physhlth_days, bmi, sex, exercise, age_group,
         income, education, smoking) |>
  tbl_summary(
    by = sex,
    label = list(
      menthlth_days  ~ "Mentally unhealthy days (past 30)",
      physhlth_days  ~ "Physically unhealthy days (past 30)",
      bmi            ~ "Body Mass Index",
      exercise       ~ "Exercise in the past 30 days",
      age_group      ~ "Age Group In 5-Year Categories",
      income         ~ "Annual Household Income",
      education      ~ "Highest Level of Education Completed",
      smoking        ~ "Smoking Status"
    ),
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = all_continuous() ~ 1,
    missing = "no"
  ) |>
  add_n() |>
  bold_labels() |>
  italicize_levels() |>
  modify_caption("**Table 1. Descriptive Statistics — BRFSS 2023 Analytic Sample (n = 8,000)**")

**Table 1. Descriptive Statistics — BRFSS 2023 Analytic Sample (n = 8,000)**
Characteristic	N	Male N = 3,936¹	Female N = 4,064¹
Mentally unhealthy days (past 30)	8,000	3.5 (7.5)	5.1 (8.6)
Physically unhealthy days (past 30)	8,000	4.0 (8.4)	4.9 (8.9)
Body Mass Index	8,000	28.7 (6.0)	28.7 (7.0)
Exercise in the past 30 days	8,000	3,146 (80%)	3,094 (76%)
Age Group In 5-Year Categories	8,000
18-24		235 (6.0%)	171 (4.2%)
25-29		219 (5.6%)	189 (4.7%)
30-34		253 (6.4%)	210 (5.2%)
35-39		263 (6.7%)	302 (7.4%)
40-44		290 (7.4%)	292 (7.2%)
45-49		266 (6.8%)	252 (6.2%)
50-54		305 (7.7%)	303 (7.5%)
55-59		308 (7.8%)	352 (8.7%)
60-64		408 (10%)	379 (9.3%)
65-69		418 (11%)	483 (12%)
70-74		382 (9.7%)	426 (10%)
75-79		325 (8.3%)	338 (8.3%)
80+		264 (6.7%)	367 (9.0%)
Annual Household Income	8,000
Less than $15,000		160 (4.1%)	247 (6.1%)
$15,000 to $25,000		271 (6.9%)	370 (9.1%)
$25,000 to $35,000		376 (9.6%)	495 (12%)
$35,000 to $50,000		482 (12%)	585 (14%)
$50,000 to $100,000		1,251 (32%)	1,260 (31%)
$100,000 to $200,000		996 (25%)	869 (21%)
$200,000 or more		400 (10%)	238 (5.9%)
Highest Level of Education Completed	8,000
Less than high school		75 (1.9%)	49 (1.2%)
High school diploma or GED		130 (3.3%)	122 (3.0%)
Some college or technical school		950 (24%)	877 (22%)
College graduate		1,018 (26%)	1,120 (28%)
Graduate or professional degree		1,763 (45%)	1,896 (47%)
Smoking Status	8,000
Current daily smoker		339 (8.6%)	319 (7.8%)
Current some-day smoker		151 (3.8%)	117 (2.9%)
Former smoker		1,207 (31%)	1,055 (26%)
Never smoker		2,239 (57%)	2,573 (63%)
¹ Mean (SD); n (%)

Part 1 Fit a multiple linear regression model with menthlth_days as the outcome and the following predictors: physhlth_days, bmi, sex, exercise, age_group, income, education, and smoking. Display the full summary() output.

# Fit a multiple linear regression model
mental_model <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + income + education + smoking, data = brfss_analytic_ne)

# Display the full summary() output
summary(mental_model)

## 
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + exercise + 
##     age_group + income + education + smoking, data = brfss_analytic_ne)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.080  -3.865  -1.597   0.712  30.471 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                                9.65053    0.95407  10.115  < 2e-16
## physhlth_days                              0.26558    0.01007  26.384  < 2e-16
## bmi                                        0.03338    0.01321   2.527 0.011510
## sexFemale                                  1.39038    0.16750   8.301  < 2e-16
## exerciseYes                               -0.65116    0.21472  -3.033 0.002432
## age_group25-29                            -1.05660    0.51938  -2.034 0.041950
## age_group30-34                            -1.09395    0.50646  -2.160 0.030803
## age_group35-39                            -1.81103    0.48851  -3.707 0.000211
## age_group40-44                            -2.89307    0.48749  -5.935 3.07e-09
## age_group45-49                            -3.05618    0.49769  -6.141 8.61e-10
## age_group50-54                            -3.51674    0.48323  -7.277 3.72e-13
## age_group55-59                            -4.49597    0.47555  -9.454  < 2e-16
## age_group60-64                            -4.52215    0.45848  -9.863  < 2e-16
## age_group65-69                            -5.57795    0.45019 -12.390  < 2e-16
## age_group70-74                            -6.02536    0.45741 -13.173  < 2e-16
## age_group75-79                            -6.28656    0.47484 -13.239  < 2e-16
## age_group80+                              -6.81968    0.47684 -14.302  < 2e-16
## income$15,000 to $25,000                  -1.63703    0.46899  -3.491 0.000485
## income$25,000 to $35,000                  -2.07637    0.44797  -4.635 3.63e-06
## income$35,000 to $50,000                  -2.54685    0.43819  -5.812 6.40e-09
## income$50,000 to $100,000                 -3.05048    0.41069  -7.428 1.22e-13
## income$100,000 to $200,000                -3.49984    0.42923  -8.154 4.07e-16
## income$200,000 or more                    -3.78409    0.50036  -7.563 4.38e-14
## educationHigh school diploma or GED        0.07991    0.81066   0.099 0.921484
## educationSome college or technical school  1.07679    0.68973   1.561 0.118520
## educationCollege graduate                  1.79091    0.69119   2.591 0.009585
## educationGraduate or professional degree   1.73781    0.69250   2.509 0.012111
## smokingCurrent some-day smoker            -1.58670    0.53523  -2.965 0.003041
## smokingFormer smoker                      -1.98971    0.33713  -5.902 3.74e-09
## smokingNever smoker                       -2.93681    0.32162  -9.131  < 2e-16
##                                              
## (Intercept)                               ***
## physhlth_days                             ***
## bmi                                       *  
## sexFemale                                 ***
## exerciseYes                               ** 
## age_group25-29                            *  
## age_group30-34                            *  
## age_group35-39                            ***
## age_group40-44                            ***
## age_group45-49                            ***
## age_group50-54                            ***
## age_group55-59                            ***
## age_group60-64                            ***
## age_group65-69                            ***
## age_group70-74                            ***
## age_group75-79                            ***
## age_group80+                              ***
## income$15,000 to $25,000                  ***
## income$25,000 to $35,000                  ***
## income$35,000 to $50,000                  ***
## income$50,000 to $100,000                 ***
## income$100,000 to $200,000                ***
## income$200,000 or more                    ***
## educationHigh school diploma or GED          
## educationSome college or technical school    
## educationCollege graduate                 ** 
## educationGraduate or professional degree  *  
## smokingCurrent some-day smoker            ** 
## smokingFormer smoker                      ***
## smokingNever smoker                       ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.352 on 7970 degrees of freedom
## Multiple R-squared:  0.1853, Adjusted R-squared:  0.1824 
## F-statistic: 62.52 on 29 and 7970 DF,  p-value: < 2.2e-16

# Organize the output into a table using the tidy() functiion
tidy(mental_model, conf.int = TRUE)  |>
  mutate(
    term = dplyr::recode(term,
      "(Intercept)"    = "Intercept",
      "menthlth_days"  = "Mentally unhealthy days (past 30)",
      "physhlth_days"  = "Physically unhealthy days (past 30)",
      "bmi"            = "Body Mass Index",
      "sex"            = "Sex",
      "exercise"       = "Exercise in the past 30 days",
      "age_group"      = "Age Group In 5-Year Categories",
      "income"         = "Annual Household Income",
      "education"      = "Highest Level of Education Completed",
      "smoking"        = "Smoking Status"
    ),
    across(where(is.numeric), ~ round(., 3))
  )  |>
  kable(
    caption   = "Table 2. Multiple Linear Regression: Mentally Unhealthy Days ~ Multiple Predictors (BRFSS 2023, n = 8,000)",
    col.names = c("Term", "Estimate", "Std. Error", "t-statistic", "p-value", "95% CI Lower", "95% CI Upper"),
    format = "html"
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = TRUE)  |>
  row_spec(0, bold = TRUE)  |>
  row_spec(c(2, 3), background = "#EBF5FB")

Table 2. Multiple Linear Regression: Mentally Unhealthy Days ~ Multiple Predictors (BRFSS 2023, n = 8,000)
Term	Estimate	Std. Error	t-statistic	p-value	95% CI Lower	95% CI Upper
Intercept	9.651	0.954	10.115	0.000	7.780	11.521
Physically unhealthy days (past 30)	0.266	0.010	26.384	0.000	0.246	0.285
Body Mass Index	0.033	0.013	2.527	0.012	0.007	0.059
sexFemale	1.390	0.168	8.301	0.000	1.062	1.719
exerciseYes	-0.651	0.215	-3.033	0.002	-1.072	-0.230
age_group25-29	-1.057	0.519	-2.034	0.042	-2.075	-0.038
age_group30-34	-1.094	0.506	-2.160	0.031	-2.087	-0.101
age_group35-39	-1.811	0.489	-3.707	0.000	-2.769	-0.853
age_group40-44	-2.893	0.487	-5.935	0.000	-3.849	-1.937
age_group45-49	-3.056	0.498	-6.141	0.000	-4.032	-2.081
age_group50-54	-3.517	0.483	-7.277	0.000	-4.464	-2.569
age_group55-59	-4.496	0.476	-9.454	0.000	-5.428	-3.564
age_group60-64	-4.522	0.458	-9.863	0.000	-5.421	-3.623
age_group65-69	-5.578	0.450	-12.390	0.000	-6.460	-4.695
age_group70-74	-6.025	0.457	-13.173	0.000	-6.922	-5.129
age_group75-79	-6.287	0.475	-13.239	0.000	-7.217	-5.356
age_group80+	-6.820	0.477	-14.302	0.000	-7.754	-5.885
income$15,000 to $25,000	-1.637	0.469	-3.491	0.000	-2.556	-0.718
income$25,000 to $35,000	-2.076	0.448	-4.635	0.000	-2.955	-1.198
income$35,000 to $50,000	-2.547	0.438	-5.812	0.000	-3.406	-1.688
income$50,000 to $100,000	-3.050	0.411	-7.428	0.000	-3.856	-2.245
income$100,000 to $200,000	-3.500	0.429	-8.154	0.000	-4.341	-2.658
income$200,000 or more	-3.784	0.500	-7.563	0.000	-4.765	-2.803
educationHigh school diploma or GED	0.080	0.811	0.099	0.921	-1.509	1.669
educationSome college or technical school	1.077	0.690	1.561	0.119	-0.275	2.429
educationCollege graduate	1.791	0.691	2.591	0.010	0.436	3.146
educationGraduate or professional degree	1.738	0.693	2.509	0.012	0.380	3.095
smokingCurrent some-day smoker	-1.587	0.535	-2.965	0.003	-2.636	-0.538
smokingFormer smoker	-1.990	0.337	-5.902	0.000	-2.651	-1.329
smokingNever smoker	-2.937	0.322	-9.131	0.000	-3.567	-2.306

Step #2: Write the fitted regression equation including all terms, with coefficients rounded to three decimal places

Mentally Unhealthy Days = 9.651 + 0.266(Phys. Days) + 0.033(BMI) + 1.390(Sex:Female) + -0.651(Exercise:Yes) + -1.057(Age:25-29) + -1.094(Age:30-34) + -1.811(Age:35-39) + -2.893(Age:40-44) + -3.056(Age:45-49) + -3.517(Age:50-54) + -4.496(Age:55-59) + -4.522(Age:60-64) + -5.578(Age:65-69) + -6.025(Age:70-74) + -6.287(Age:75-79) + -6.820(Age:80+) + -1.637(Income:15,000 to 25,000) + -2.076(Income:25,000 to 35,000) + -2.547(Income:35,000 to 50,000) + -3.050(Income:50,000 to 100,000) + -3.500(Income:100,000 to 200,000) + -3.784(Income:$200,00 or more) + 0.080(Education: High School or GED) + 1.077(Education:Some College or Tech.) + 1.791(Education:College grad.) + 1.738(Education:Grad. or prof. degree) + -1.587(Smoking:Current) + -1.990(Smoking:Former) + -2.937(Smoking:Never)

Step #3: Interpret the following coefficients in plain language. For each, state the direction, magnitude, and meaning in terms of mentally unhealthy days, holding all other variables constant

physhlth_days (continuous predictor)

Physhlth_days(0.266): For every one day increase in physically unhealthy days, there is an associated 0.266 increase in the number of mentally unhealthy days an individual will experience, holding all other variables constant.

bmi (continuous predictor)

bmi(0.033): For every one unit increase in BMI, there is an associated 0.033 increase in the number of mentally unhealthy days an individual will experience, holding all other variables constant.

sex: Female vs. Male (reference)

sex:Female(1.390): Compared to males (the reference group), females report an estimated 1.390 more mentally unhealthy days on average, holding all other variables constant.

exercise: Yes vs. No (reference)

Exercise:Yes (-0.651): Compared to individuals who do not exercise (the reference group), individuals who do exercise report an estimated 0.651 less mentally unhealthy days on average, holding all other variables constant.

One age_group coefficient of your choice

age_group25-29(-1.057): Compared to individuals who are 18-24 years of age (the reference group), individuals who are 25-29 years of age report an estimated 1.057 less mentally unhealthy days on average, holding all other variables constant.

Two income coefficients of your choice, compared to the reference category (Less than $15,000)

income: $15,000 to $25,000 (-1.637):Compared to individuals who have an annual household income less than $15,000 (the reference group), individuals who have a annual household income of $15,000 to $25,000 report an estimated 1.637 less mentally unhealthy days on average, holding all other variables constant.

income:$25,000 to $35,000 (-2.076): Compared to individuals who have an annual household income less than $15,000 (the reference group), individuals who have a annual household income of $25,000 to $35,000 report an estimated 2.076 less mentally unhealthy days on average, holding all other variables constant.

Step 4: Report and interpret each of the following model-level statistics

glance(mental_model)  |>
  select(r.squared, adj.r.squared, sigma, statistic, p.value, df, df.residual, nobs)  |>
  mutate(across(where(is.numeric), ~ round(., 4)))  |>
  pivot_longer(everything(), names_to = "Statistic", values_to = "Value")  |>
  mutate(Statistic = dplyr::recode(Statistic,
    "r.squared"     = "R²",
    "adj.r.squared" = "Adjusted R²",
    "sigma"         = "Residual Std. Error (Root MSE)",
    "statistic"     = "F-statistic",
    "p.value"       = "p-value (overall F-test)",
    "df"            = "Model df (p)",
    "df.residual"   = "Residual df (n − p − 1)",
    "nobs"          = "n (observations)"
  )) %>%
  kable(caption = "Table 3. Overall Model Summary — Mental Model")  |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Table 3. Overall Model Summary — Mental Model
Statistic	Value
R²	0.1853
Adjusted R²	0.1824
Residual Std. Error (Root MSE)	7.3516
F-statistic	62.5234
p-value (overall F-test)	0.0000
Model df (p)	29.0000
Residual df (n − p − 1)	7970.0000
n (observations)	8000.0000

R-squared: proportion of variance in mentally unhealthy days explained by all predictors

According to the output, R-squared = 0.1853. In words, 18.53% of the variance in mentally unhealthy days is explained by all of the predictors that were included in the model (physhlth_days + bmi + sex + exercise + age_group + income + education + smoking).

Adjusted R-squared: how it differs from R-squared and what it adds

According to the output, the adjusted R-squared = 0.1824. In words, 18.24% of the variance in mentally unhealthy days is explained by all predictors listed in the model (physhlth_days + bmi + sex + exercise + age_group + income + education + smoking), after adjusting for the number of predictors. To explain further, unlike the R-squared value, the adjusted R-squared penalizes for the number of predictors and increases only when a new predictor improves the model beyond chance alone. Essentially, it informs you if the new predictor you added to your model either improves or penalizes it.

Root MSE (residual standard error): average prediction error of the model

According to the output, the Root MSE = 7.35. In words, on average, the mental models prediction of the number of mentally unhealthy days someone may report is off by about 7.35 days.

Overall F-test: state H0, F-statistic, degrees of freedom (numerator and denominator), p-value, and conclusion

Overall F-test (null) = H0:𝛽1=𝛽2=⋯=𝛽𝑝=0(none of the predictors matter) Overall F-test (alternative) = HA:At least one 𝛽𝑗≠0

The F-statistic = 62.5234, the degrees of freedom (numerator) = 29.0000, the degrees of freedom (denominator) = 7970.0000, and the p-value (overall F-test) = 0.0000. In conclusion, since the p-value is less than 0.05, we reject the null hypothesis that none of the predictors in the model matters. This means that at least one of the predictors that were included in the model (physhlth_days + bmi + sex + exercise + age_group + income + education + smoking) is significantly associated with the outcome (mentally unhealthy days).

Part 2 Step 1: Obtain Type III (partial) sums of squares for all predictors using car::Anova(model, type = “III”). Display the full output.

# Type III using car::Anova()
anova_typeIII <- Anova(mental_model, type = "III")

# Display the full output
anova_typeIII  |>
  as.data.frame()  |>
  rownames_to_column("Source") %>%
  mutate(across(where(is.numeric), ~ round(., 4)))  |>
  kable(
    caption = "Table 4. Type III (Partial) Sums of Squares — car::Anova()",
    col.names = c("Source", "Sum of Sq", "df", "F value", "p-value")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  footnote(general = "Each F-test uses the MSE from the mental model. Each variable is tested given ALL other variables.")

Table 4. Type III (Partial) Sums of Squares — car::Anova()
Source	Sum of Sq	df	F value	p-value
(Intercept)	5529.7722	1	102.3152	0.0000
physhlth_days	37622.7952	1	696.1198	0.0000
bmi	345.2408	1	6.3879	0.0115
sex	3723.8662	1	68.9012	0.0000
exercise	497.0434	1	9.1966	0.0024
age_group	30092.1774	12	46.3986	0.0000
income	4733.8943	6	14.5982	0.0000
education	1265.1504	4	5.8521	0.0001
smoking	5101.1076	3	31.4613	0.0000
Residuals	430750.0872	7970	NA	NA
Note:
Each F-test uses the MSE from the mental model. Each variable is tested given ALL other variables.

Identify which predictors have statistically significant partial associations at α = 0.05.

At α = 0.05, all predictors that were included in the mental model (physhlth_days, bmi, sex, exercise, age_group, income, education, and smoking) have statistically significant partial associations.

Step 2: (Chunk test for income): Test whether income collectively improves the model significantly, after accounting for all other predictors.

Fit a reduced model that includes all predictors except income and a full model.

# Reduced model (excludes income)
m_no_income <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + education + smoking, data = brfss_analytic_ne)

# Full model (includes income)
full <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + income + education + smoking, data = brfss_analytic_ne)

Use anova(model_reduced, model_full) to compare the two models.

# Compare using anova()
anova(m_no_income, full) |>
  as.data.frame()  |>
  rownames_to_column("Model")  |>
  mutate(
    Model = c("Reduced (no income)", "Full (with income)"),
    across(where(is.numeric), ~ round(., 4))
  ) %>%
  kable(
    caption = "Table 5. Partial F-Test: Does income add to the model?",
    col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Table 5. Partial F-Test: Does income add to the model?
Model	Res. df	RSS	df	Sum of Sq	F	p-value
Reduced (no income)	7976	435484.0	NA	NA	NA	NA
Full (with income)	7970	430750.1	6	4733.894	14.5982	0

State H0 and HA for this test clearly

𝐻0:𝛽∗=0 𝐻1:𝛽∗≠0

HO: Income does not contribute to the prediction of mentally unhealthy days given that physhlth_days + bmi + sex + exercise + age_group + education + smoking are already in the model.

H1: Income does contribute to the prediction of mentally unhealthy days given that physhlth_days + bmi + sex + exercise + age_group + education + smoking are already in the model.

Report the F-statistic, degrees of freedom, and p-value.

The F-statistic = 14.5982, df = 6, and p-value = 0.

State your conclusion at α = 0.05

At α = 0.05, since the p-value is less than 0.05, we can reject the H0 and conclude that income adds statistically significant information to the prediction of mental health days given that physhlth_days, bmi, sex, exercise, age_group, education, and smoking are already in the model.

Step 3 (Chunk test for education): Repeat the same procedure for education as a group of predictors.

Fit a reduced model that includes all predictors except education and a full model.

# Reduced model (excludes education)
m_no_edu <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + income + smoking, data = brfss_analytic_ne)

# Full model (includes education)
full <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + income + education + smoking, data = brfss_analytic_ne)

Use anova(model_reduced, model_full) to compare the two models.

# Compare using anova()
anova(m_no_edu, full) |>
  as.data.frame()  |>
  rownames_to_column("Model")  |>
  mutate(
    Model = c("Reduced (no education)", "Full (with education)"),
    across(where(is.numeric), ~ round(., 4))
  ) %>%
  kable(
    caption = "Table 6. Partial F-Test: Does education add to the model?",
    col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Table 6. Partial F-Test: Does education add to the model?
Model	Res. df	RSS	df	Sum of Sq	F	p-value
Reduced (no education)	7974	432015.2	NA	NA	NA	NA
Full (with education)	7970	430750.1	4	1265.15	5.8521	1e-04

State H0 and HA for this test clearly

𝐻0:𝛽∗=0 𝐻1:𝛽∗≠0

HO: education does not contribute to the prediction of mentally unhealthy days given that physhlth_days + bmi + sex + exercise + age_group + income + smoking are already in the model.

H1: Education does contribute to the prediction of mentally unhealthy days given that physhlth_days + bmi + sex + exercise + age_group + income + smoking are already in the model.

Report the F-statistic, degrees of freedom, and p-value.

The F-statistic = 5.8521, df = 4, and p-value = 1e-04.

State your conclusion at α = 0.05

At α = 0.05, since the p-value is less than 0.05, we can reject the H0 and conclude that education adds statistically significant information to the prediction of mental health days given that physhlth_days, bmi, sex, exercise, age_group, income, and smoking are already in the model.

Step 4: Write a 3 to 5 sentence synthesis interpreting the Type III results and your chunk test findings. Which predictors make the strongest independent contributions? What do the chunk tests add beyond the individual t-tests from summary()

Based on the findings from the type III results, all predictors that were included in the mental model (physhlth_days, bmi, sex, exercise, age_group, income, education, and smoking) have statistically significant partial associations. Among these, physhlth_days, sex, and age_group have the largest F-statistics, indicating relatively stronger evidence of association after adjusting for other variables. According to the chunk tests, income and education were determined to add statistically significant information to the prediction of mental health days given all other predictors were in the model. Unlike the individual t-tests from summary(), the chunk tests allow you to test whether a group of variables collectively adds to the model rather than comparing one category at a time.

Part 3 Step 1: Create a binary smoking variable called smoker_current (“Current smoker” includes current daily smokers and current some-day smokers) (“Non-smoker” includes former smokers and never smokers. Use “Non-smoker” as the reference category).

# Create a binary smoking variable caled smoker_current
brfss_smoker <- brfss_analytic_ne |>
  mutate(
    smoker_current = factor(
      case_when(
        smoking %in% c("Former smoker", "Never smoker") ~ "Non-Smoker",
        smoking %in% c("Current daily smoker", "Current some-day smoker") ~ "Current Smoker"
      ),
      levels = c("Non-Smoker", "Current Smoker")
    )
  )

Step 2: Fit two models: Model A (main effects only): regress menthlth_days on physhlth_days, bmi, sex, smoker_current, exercise, age_group, income, and education. Model B (with interaction): same as Model A, but use sex * smoker_current in the formula to include the interaction term.

# Model without interaction
model_A <- lm(menthlth_days ~ physhlth_days + bmi + exercise + age_group + income + education + sex + smoker_current, data = brfss_smoker)

# Model with interaction
model_B <- lm(menthlth_days ~ physhlth_days + bmi + exercise + age_group + income + education + sex * smoker_current, data = brfss_smoker)

Step 3: Test whether the interaction is statistically significant:

Use anova(model_A, model_B) to compare the two models.

anova(model_A, model_B) |>
  tidy() |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(caption = "Table 7. Test for Coincidence: Does Smoker Status Affect the Mental Health Relationship with Numerous Predictors at All?") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Table 7. Test for Coincidence: Does Smoker Status Affect the Mental Health Relationship with Numerous Predictors at All?
term	df.residual	rss	df	sumsq	statistic	p.value
menthlth_days ~ physhlth_days + bmi + exercise + age_group + income + education + sex + smoker_current	7972	432508.9	NA	NA	NA	NA
menthlth_days ~ physhlth_days + bmi + exercise + age_group + income + education + sex * smoker_current	7971	432174.9	1	333.9749	6.1598	0.0131

State H0 and HA for this test.

H0: 𝐻0:𝛽2=𝛽3=0(the two lines are identical in intercept and/or slope)

HA:At least one ≠0(the lines differ in intercept and/or slope)

Report the F-statistic, degrees of freedom, and p-value.

The F-statistic = 6.1598, degrees of freedom = 1, and the p-value = 0.0131

State your conclusion at α = 0.05.

At α = 0.05, we can reject the null hypothesis that the two current smoking status-specific regression lines are identical. This means that current smoking status does significantly modify either the baseline level or the slope of the mental-health and physical health days, bmi, sex, exercise, age_group, income, and education relationship.

Interpret the interaction using the coefficients from Model B:

tidy(model_B, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Table 8. Interaction Model: Physical Health Days + BMI + Exercise + Age Group + Income + Education + Sex * Smoking Status → Mental Health Days",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Table 8. Interaction Model: Physical Health Days + BMI + Exercise + Age Group + Income + Education + Sex * Smoking Status → Mental Health Days
Term	Estimate	SE	t	p-value	CI Lower	CI Upper
(Intercept)	6.8994	0.9286	7.4301	0.0000	5.0791	8.7196
physhlth_days	0.2686	0.0101	26.6788	0.0000	0.2489	0.2883
bmi	0.0331	0.0132	2.5017	0.0124	0.0072	0.0590
exerciseYes	-0.6728	0.2150	-3.1301	0.0018	-1.0942	-0.2515
age_group25-29	-0.9202	0.5196	-1.7709	0.0766	-1.9388	0.0984
age_group30-34	-0.8924	0.5059	-1.7640	0.0778	-1.8841	0.0993
age_group35-39	-1.5929	0.4875	-3.2673	0.0011	-2.5485	-0.6372
age_group40-44	-2.6286	0.4856	-5.4127	0.0000	-3.5806	-1.6766
age_group45-49	-2.8425	0.4969	-5.7209	0.0000	-3.8165	-1.8686
age_group50-54	-3.2778	0.4820	-6.8012	0.0000	-4.2226	-2.3331
age_group55-59	-4.2499	0.4740	-8.9652	0.0000	-5.1791	-3.3206
age_group60-64	-4.2640	0.4567	-9.3364	0.0000	-5.1593	-3.3688
age_group65-69	-5.2506	0.4466	-11.7563	0.0000	-6.1261	-4.3751
age_group70-74	-5.7111	0.4544	-12.5686	0.0000	-6.6018	-4.8203
age_group75-79	-5.9076	0.4702	-12.5646	0.0000	-6.8292	-4.9859
age_group80+	-6.4995	0.4736	-13.7239	0.0000	-7.4278	-5.5711
income$15,000 to $25,000	-1.6357	0.4700	-3.4804	0.0005	-2.5570	-0.7144
income$25,000 to $35,000	-2.0746	0.4486	-4.6243	0.0000	-2.9540	-1.1952
income$35,000 to $50,000	-2.5455	0.4392	-5.7964	0.0000	-3.4064	-1.6847
income$50,000 to $100,000	-3.0430	0.4116	-7.3935	0.0000	-3.8498	-2.2362
income$100,000 to $200,000	-3.5097	0.4300	-8.1623	0.0000	-4.3526	-2.6668
income$200,000 or more	-3.8405	0.5010	-7.6651	0.0000	-4.8226	-2.8583
educationHigh school diploma or GED	0.1256	0.8124	0.1546	0.8772	-1.4669	1.7180
educationSome college or technical school	1.1179	0.6912	1.6172	0.1059	-0.2371	2.4729
educationCollege graduate	1.8179	0.6928	2.6239	0.0087	0.4598	3.1760
educationGraduate or professional degree	1.6691	0.6943	2.4040	0.0162	0.3081	3.0300
sexFemale	1.1855	0.1775	6.6784	0.0000	0.8376	1.5335
smoker_currentCurrent Smoker	1.5208	0.3654	4.1621	0.0000	0.8045	2.2371
sexFemale:smoker_currentCurrent Smoker	1.2833	0.5171	2.4819	0.0131	0.2697	2.2968

Report the estimated effect of current smoking on mentally unhealthy days among men (the coefficient for smoker_current). Interpret in plain language.

The estimated effect of current smoking on mentally unhealthy days among men is 1.5208. In plain language, for every additional day a male is a current smoker, there is an associated 1.5 day increase in the number of mentally unhealthy days they will report.

Compute and report the estimated effect among women (smoker_current coefficient plus the interaction coefficient). Interpret in plain language.

The estimated effect of current smoking on mentally unhealthy days among women is 1.2833. In plain language, for every additional day a female is a current smoker, there is an associated 1.3 day increase in the number of mentally unhealthy days they will report.

In 2 to 3 sentences, explain what these estimates mean together: is smoking more strongly associated with poor mental health in one sex than the other, and by how much?

To explain the meaning of these estimates together, it appears that smoking is more strongly associated with poor mental health in men compared to women by about 0.2 days. In other words, smoking appears to effect the association between poor mental health and sex by increasing the number of reported poor mental health days by about 0.2 days.

Step 5: Visualize the interaction using ggeffects::ggpredict(model_B, terms = c(“smoker_current”, “sex”)). The plot should show predicted mentally unhealthy days for each combination of smoking status and sex, with labeled axes, a descriptive title, and a legend identifying men and women.

int_B <- ggpredict(model_B, terms = c("smoker_current", "sex"))

ggplot(int_B, aes(x = x, y = predicted, color = group, fill = group, group = group)) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.12, color = NA) +
  labs(
    title    = "Predicted Mentally Unhealthy Days by Smoking Status and Sex",
    subtitle = "From interaction model: sex * smoker_current",
    x        = "Current Smoking Status",
    y        = "Predicted Mentally Unhealthy Days",
    color    = "Sex",
    fill     = "Sex"
  ) +
  theme_minimal(base_size = 13) +
  scale_color_brewer(palette = "Set1")

# Step 6: Write 3 to 5 sentences interpreting the interaction for a general public health audience. Do not include coefficient values or p-values. Focus on the substantive finding and its implications.

Based on our findings, it appears that females are more likely to report higher amounts of mentally unhealthy days compared to males. However, both males and females are more likely to report higher numbers of mentally unhealthy days as their smoking status goes from being a non-smoker to a current smoker. This reinforces the conclusion that smoking status meaningfully modifies the relationship between sex and mentally unhealthy days in this sample.

A. Findings and Limitations (6 points) What do the results suggest about the determinants of mental health among U.S. adults? Which predictors were most strongly associated? Which were not statistically significant? What are the key limitations of using cross-sectional survey data? Name at least two specific confounders that could bias the associations you estimated.

The results suggest that, at α = 0.05, all predictors included in the mental model (physhlth_days, bmi, sex, exercise, age_group, income, education, and smoking) have statistically significant partial associations to the outcome of mentally unhealth days. It also appears that having an age from 70-80+ and having an annual household income of $50,000 to 200,00 were the most strongly associated to the outcome after comparing the correlation coefficients observed in Table 2. The key limitations of using cross-sectional survey data are that you cannot establish temporality and that it is vulnerable to confounding, as we were unable to evaluate all variables outside the questions asked of participants. Two specific counfounders that could bias the estimated associations may include geographic location, and occupancy.

B. From Simple to Multiple Regression (4 points) What does Adjusted R-squared tell you that regular R-squared does not, and why is this distinction especially important when adding multiple categorical predictors? Why is it useful to test groups of predictors (income, education) collectively with chunk tests, rather than relying only on the individual t-test for each level?

From the simple to multiple regression, the adjusted R-squared tells you if a new predictor improves the fit of the model after accounting for the number of predictors you had included. Whereas a regular R-squared tells you how much of the variance in the outcome was caused by the predictors included in the model without penalization for the number of predictors. This distinction is especially important when adding multiple categorical predictors, because R-squared may increase simply from adding categories, thereby showing an increase in the outcome’s variance regardless of the importance of each additional predictor. It is useful to test groups of predictors collectively with chunk tests so that you can collectively test the significance of an entire predictor and all of its categories to the outcome, rather than testing each individual category within a variable to its reference group.

C. AI Transparency (5 points) Describe which parts of the assignment (if any) you sought AI assistance for, what specific prompts you used, and how you verified the AI-assisted output was correct. If you did not use AI, state that explicitly.

I used ChatGPT mainly for the recoding section and the section where we were asked to report the number and percentage of missing observations for menthlth_days and at least two other key variables, before removing any missing values for debugging purposes. The specific prompts I used were “What is wrong with my R code?” or “Explain this error”. I also sought help from ChatGPT for the last section, as I could not figure out why none of my regression lines were being added to the graph. This prompt was: “What can I add to my R code in order to get the lines to work for this graph?” I verified the AI-assisted output was correct by comparing it to the codes we had received during our lectures and comparing my output to others within the class.

Epi 533 Assignment #3

Morgan Wheat

2026-04-05

Part 0

Load required packages: tidyverse, haven, broom, kableExtra, car, gtsummary, ggeffects.

Find LLCP2023.XPT location using file.choose()

Import LLCP2023.XPT using read_xpt()

Report the total number of rows and columns in the raw dataset.

Select the nine variables listed in the table above. Use select()

Recode all variables as described above. Store all categorical variables as labeled factors.

Report the final analytic n.

Produce a descriptive statistics table using gtsummary::tbl_summary(), stratified by sex.

Part 1

Fit a multiple linear regression model with menthlth_days as the outcome and the following predictors: physhlth_days, bmi, sex, exercise, age_group, income, education, and smoking. Display the full summary() output.

Step #2: Write the fitted regression equation including all terms, with coefficients rounded to three decimal places

Step #3: Interpret the following coefficients in plain language. For each, state the direction, magnitude, and meaning in terms of mentally unhealthy days, holding all other variables constant

physhlth_days (continuous predictor)

bmi (continuous predictor)

sex: Female vs. Male (reference)

exercise: Yes vs. No (reference)

One age_group coefficient of your choice

Two income coefficients of your choice, compared to the reference category (Less than $15,000)

Step 4: Report and interpret each of the following model-level statistics

R-squared: proportion of variance in mentally unhealthy days explained by all predictors

Adjusted R-squared: how it differs from R-squared and what it adds

Root MSE (residual standard error): average prediction error of the model

Overall F-test: state H0, F-statistic, degrees of freedom (numerator and denominator), p-value, and conclusion

Part 2

Step 1: Obtain Type III (partial) sums of squares for all predictors using car::Anova(model, type = “III”). Display the full output.

Identify which predictors have statistically significant partial associations at α = 0.05.

Step 2: (Chunk test for income): Test whether income collectively improves the model significantly, after accounting for all other predictors.

Fit a reduced model that includes all predictors except income and a full model.

Use anova(model_reduced, model_full) to compare the two models.

State H0 and HA for this test clearly

Report the F-statistic, degrees of freedom, and p-value.

State your conclusion at α = 0.05

Step 3 (Chunk test for education): Repeat the same procedure for education as a group of predictors.

Fit a reduced model that includes all predictors except education and a full model.

Use anova(model_reduced, model_full) to compare the two models.

State H0 and HA for this test clearly

Report the F-statistic, degrees of freedom, and p-value.

State your conclusion at α = 0.05

Step 4: Write a 3 to 5 sentence synthesis interpreting the Type III results and your chunk test findings. Which predictors make the strongest independent contributions? What do the chunk tests add beyond the individual t-tests from summary()

Part 3

Step 1: Create a binary smoking variable called smoker_current (“Current smoker” includes current daily smokers and current some-day smokers) (“Non-smoker” includes former smokers and never smokers. Use “Non-smoker” as the reference category).

Step 2: Fit two models: Model A (main effects only): regress menthlth_days on physhlth_days, bmi, sex, smoker_current, exercise, age_group, income, and education. Model B (with interaction): same as Model A, but use sex * smoker_current in the formula to include the interaction term.

Step 3: Test whether the interaction is statistically significant:

Use anova(model_A, model_B) to compare the two models.

State H0 and HA for this test.

Report the F-statistic, degrees of freedom, and p-value.

State your conclusion at α = 0.05.

Interpret the interaction using the coefficients from Model B:

Report the estimated effect of current smoking on mentally unhealthy days among men (the coefficient for smoker_current). Interpret in plain language.

Compute and report the estimated effect among women (smoker_current coefficient plus the interaction coefficient). Interpret in plain language.

In 2 to 3 sentences, explain what these estimates mean together: is smoking more strongly associated with poor mental health in one sex than the other, and by how much?

Step 5: Visualize the interaction using ggeffects::ggpredict(model_B, terms = c(“smoker_current”, “sex”)). The plot should show predicted mentally unhealthy days for each combination of smoking status and sex, with labeled axes, a descriptive title, and a legend identifying men and women.

C. AI Transparency (5 points) Describe which parts of the assignment (if any) you sought AI assistance for, what specific prompts you used, and how you verified the AI-assisted output was correct. If you did not use AI, state that explicitly.