Part 0: Data Acquisition and Preparation (15 Points)

Step 1: Load Required Packages

library(tidyverse)
library(haven)
library(janitor)
library(broom)
library(knitr)
library(kableExtra)
library(car)
library(gtsummary)
library(ggeffects)

Step 2: Import the Raw BRFSS 2023 Data

brfss_raw <- read_xpt('/Users/emmanuelsmac/Desktop/LLCP2023.XPT ')

cat("Raw dataset dimensions:\n")
cat("  Rows:   ", nrow(brfss_raw), "\n")
cat("  Columns:", ncol(brfss_raw), "\n")

The raw 2023 BRFSS dataset contains 433,323 rows and 350 columns, representing one respondent per row across all U.S. states and territories for the 2023 survey year.

Step 3: Select and Recode All Variables

# ── CONFIRMED VARIABLE NAME MAPPING ──────────────────────────────────────────
# clean_names() in this version of janitor drops leading underscores entirely
# rather than replacing them with "x_". Exact names confirmed from your data:
#
#   Raw BRFSS name  →  clean_names() output
#   MENTHLTH        →  menthlth
#   PHYSHLTH        →  physhlth
#   _BMI5           →  bmi5          (NOT x_bmi5)
#   SEXVAR          →  sexvar
#   EXERANY2        →  exerany2
#   _AGEG5YR        →  ageg5yr       (NOT x_ageg5yr)
#   _INCOMG1        →  incomg1       (NOT x_incomg1)
#   EDUCA           →  educa
#   _SMOKER3        →  smoker3       (NOT x_smoker3)
# ─────────────────────────────────────────────────────────────────────────────

brfss_raw_clean <- brfss_raw |>
  clean_names()

# Diagnostic — verify exact names before selecting
cat("menthlth :", names(brfss_raw_clean)[str_detect(names(brfss_raw_clean), "^menthlth$")], "\n")
cat("physhlth :", names(brfss_raw_clean)[str_detect(names(brfss_raw_clean), "^physhlth$")], "\n")
cat("bmi5     :", names(brfss_raw_clean)[str_detect(names(brfss_raw_clean), "^bmi5$")], "\n")
cat("sexvar   :", names(brfss_raw_clean)[str_detect(names(brfss_raw_clean), "^sexvar$")], "\n")
cat("exerany2 :", names(brfss_raw_clean)[str_detect(names(brfss_raw_clean), "^exerany2$")], "\n")
cat("ageg5yr  :", names(brfss_raw_clean)[str_detect(names(brfss_raw_clean), "^ageg5yr$")], "\n")
cat("incomg1  :", names(brfss_raw_clean)[str_detect(names(brfss_raw_clean), "^incomg1$")], "\n")
cat("educa    :", names(brfss_raw_clean)[str_detect(names(brfss_raw_clean), "^educa$")], "\n")
cat("smoker3  :", names(brfss_raw_clean)[str_detect(names(brfss_raw_clean), "^smoker3$")], "\n")

# ── Select the nine required variables ───────────────────────────────────────
brfss_selected <- brfss_raw_clean |>
  select(
    menthlth,   # was MENTHLTH
    physhlth,   # was PHYSHLTH
    bmi5,       # was _BMI5    → confirmed: bmi5
    sexvar,     # was SEXVAR
    exerany2,   # was EXERANY2
    ageg5yr,    # was _AGEG5YR → confirmed: ageg5yr
    incomg1,    # was _INCOMG1 → confirmed: incomg1
    educa,      # was EDUCA
    smoker3     # was _SMOKER3 → confirmed: smoker3
  )

# ── Recode all nine variables ─────────────────────────────────────────────────
brfss_clean <- brfss_selected |>
  mutate(

    # Mentally unhealthy days: 88=None->0; 77/99=DK/Refused->NA; 1-30 kept
    menthlth_days = case_when(
      menthlth == 88                  ~ 0,
      menthlth >= 1 & menthlth <= 30  ~ as.numeric(menthlth),
      menthlth %in% c(77, 99)         ~ NA_real_,
      TRUE                            ~ NA_real_
    ),

    # Physically unhealthy days: same recoding rules
    physhlth_days = case_when(
      physhlth == 88                  ~ 0,
      physhlth >= 1 & physhlth <= 30  ~ as.numeric(physhlth),
      physhlth %in% c(77, 99)         ~ NA_real_,
      TRUE                            ~ NA_real_
    ),

    # BMI: stored as BMI × 100; divide by 100 for actual value; 9999->NA
    bmi = case_when(
      bmi5 == 9999 ~ NA_real_,
      TRUE         ~ as.numeric(bmi5) / 100
    ),

    # Sex: 1=Male (reference), 2=Female
    sex = factor(
      case_when(
        sexvar == 1 ~ "Male",
        sexvar == 2 ~ "Female",
        TRUE        ~ NA_character_
      ),
      levels = c("Male", "Female")
    ),

    # Exercise: 1=Yes, 2=No; 7/9->NA; No is reference
    exercise = factor(
      case_when(
        exerany2 == 1          ~ "Yes",
        exerany2 == 2          ~ "No",
        exerany2 %in% c(7, 9)  ~ NA_character_,
        TRUE                   ~ NA_character_
      ),
      levels = c("No", "Yes")
    ),

    # Age group: 13 five-year bands; 14->NA; 18-24 is reference
    age_group = factor(
      case_when(
        ageg5yr == 1  ~ "18-24",
        ageg5yr == 2  ~ "25-29",
        ageg5yr == 3  ~ "30-34",
        ageg5yr == 4  ~ "35-39",
        ageg5yr == 5  ~ "40-44",
        ageg5yr == 6  ~ "45-49",
        ageg5yr == 7  ~ "50-54",
        ageg5yr == 8  ~ "55-59",
        ageg5yr == 9  ~ "60-64",
        ageg5yr == 10 ~ "65-69",
        ageg5yr == 11 ~ "70-74",
        ageg5yr == 12 ~ "75-79",
        ageg5yr == 13 ~ "80+",
        ageg5yr == 14 ~ NA_character_,
        TRUE          ~ NA_character_
      ),
      levels = c("18-24","25-29","30-34","35-39","40-44","45-49",
                 "50-54","55-59","60-64","65-69","70-74","75-79","80+")
    ),

    # Income: 7 levels; 9->NA; "Less than $15,000" is reference
    income = factor(
      case_when(
        incomg1 == 1 ~ "Less than $15,000",
        incomg1 == 2 ~ "$15,000-$24,999",
        incomg1 == 3 ~ "$25,000-$34,999",
        incomg1 == 4 ~ "$35,000-$49,999",
        incomg1 == 5 ~ "$50,000-$99,999",
        incomg1 == 6 ~ "$100,000-$199,999",
        incomg1 == 7 ~ "$200,000 or more",
        incomg1 == 9 ~ NA_character_,
        TRUE         ~ NA_character_
      ),
      levels = c("Less than $15,000","$15,000-$24,999","$25,000-$34,999",
                 "$35,000-$49,999","$50,000-$99,999",
                 "$100,000-$199,999","$200,000 or more")
    ),

    # Education: 5 levels; 1-2 collapsed; 9->NA
    education = factor(
      case_when(
        educa %in% c(1, 2) ~ "Less than high school",
        educa == 3         ~ "High school diploma or GED",
        educa == 4         ~ "Some college or technical school",
        educa == 5         ~ "College graduate",
        educa == 6         ~ "Graduate or professional degree",
        educa == 9         ~ NA_character_,
        TRUE               ~ NA_character_
      ),
      levels = c("Less than high school",
                 "High school diploma or GED",
                 "Some college or technical school",
                 "College graduate",
                 "Graduate or professional degree")
    ),

    # Smoking status: 4 levels; 9->NA
    smoking = factor(
      case_when(
        smoker3 == 1 ~ "Current daily smoker",
        smoker3 == 2 ~ "Current some-day smoker",
        smoker3 == 3 ~ "Former smoker",
        smoker3 == 4 ~ "Never smoker",
        smoker3 == 9 ~ NA_character_,
        TRUE         ~ NA_character_
      ),
      levels = c("Current daily smoker","Current some-day smoker",
                 "Former smoker","Never smoker")
    )

  ) |>
  select(menthlth_days, physhlth_days, bmi, sex, exercise,
         age_group, income, education, smoking)

# Save immediately — avoids reloading the 300MB XPT file in future sessions
saveRDS(brfss_clean, '/Users/emmanuelsmac/Desktop/LLCP2023.XPT ')
cat("Saved: brfss_hw3_clean.rds —", nrow(brfss_clean), "rows,",
    ncol(brfss_clean), "columns\n")

Step 4: Load the Pre-Saved Clean Dataset

# Load from the saved RDS — runs in seconds, no need to reload the XPT file.
brfss_clean <- readRDS('/Users/emmanuelsmac/Desktop/LLCP2023.XPT ')

cat("Clean dataset dimensions:\n")

## Clean dataset dimensions:

cat("  Rows:    ", nrow(brfss_clean), "\n")

##   Rows:     433323

cat("  Columns: ", ncol(brfss_clean), "\n")

##   Columns:  9

cat("Variables: ", paste(names(brfss_clean), collapse = ", "), "\n")

## Variables:  menthlth_days, physhlth_days, bmi, sex, exercise, age_group, income, education, smoking

Step 5: Report Missingness Before Exclusions

miss_report <- brfss_clean |>
  summarise(
    across(
      everything(),
      list(
        N_missing   = ~ sum(is.na(.)),
        Pct_missing = ~ round(mean(is.na(.)) * 100, 1)
      ),
      .names = "{.col}__{.fn}"
    )
  ) |>
  pivot_longer(
    everything(),
    names_to  = c("Variable", ".value"),
    names_sep = "__"
  ) |>
  rename(`N Missing` = N_missing, `% Missing` = Pct_missing)

miss_report |>
  kable(
    caption   = "Table 0.1. Missing Data for All Nine Analysis Variables (Before Exclusions)",
    col.names = c("Variable", "N Missing", "% Missing")
  ) |>
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)

Table 0.1. Missing Data for All Nine Analysis Variables (Before Exclusions)
Variable	N Missing	% Missing
menthlth_days	8108	1.9
physhlth_days	10785	2.5
bmi	40535	9.4
sex	0	0.0
exercise	1251	0.3
age_group	7779	1.8
income	86623	20.0
education	2325	0.5
smoking	23062	5.3

Step 6: Create the Analytic Dataset

set.seed(1220)
brfss_analytic <- brfss_clean |>
  drop_na(menthlth_days, physhlth_days, bmi, sex, exercise,
          age_group, income, education, smoking) |>
  slice_sample(n = 8000)

cat("Final analytic sample size: n =", nrow(brfss_analytic), "\n")

## Final analytic sample size: n = 8000

Final analytic sample: After removing observations with missing values on any of the nine analysis variables and drawing a reproducible random sample using set.seed(1220), the analytic dataset contains n = 8,000 observations.

Step 7: Descriptive Statistics Table Stratified by Sex

brfss_analytic |>
  select(menthlth_days, physhlth_days, bmi, exercise,
         age_group, income, education, smoking, sex) |>
  tbl_summary(
    by = sex,
    label = list(
      menthlth_days ~ "Mentally unhealthy days (past 30)",
      physhlth_days ~ "Physically unhealthy days (past 30)",
      bmi           ~ "Body Mass Index (kg/m²)",
      exercise      ~ "Any exercise in past 30 days",
      age_group     ~ "Age group",
      income        ~ "Annual household income",
      education     ~ "Education level",
      smoking       ~ "Smoking status"
    ),
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits  = all_continuous() ~ 1,
    missing = "no"
  ) |>
  add_overall() |>
  add_p() |>
  bold_labels() |>
  italicize_levels() |>
  modify_caption(
    "**Table 0.2. Descriptive Statistics — BRFSS 2023 Analytic Sample (n = 8,000), Stratified by Sex**"
  ) |>
  as_flex_table()

**Table 0.2. Descriptive Statistics — BRFSS 2023 Analytic Sample (n = 8,000), Stratified by Sex**
Characteristic	Overall N = 8,0001	Male N = 3,9361	Female N = 4,0641	p-value2
Mentally unhealthy days (past 30)	4.3 (8.1)	3.5 (7.5)	5.1 (8.6)	<0.001
Physically unhealthy days (past 30)	4.4 (8.7)	4.0 (8.4)	4.9 (8.9)	<0.001
Body Mass Index (kg/m²)	28.7 (6.5)	28.7 (6.0)	28.7 (7.0)	0.008
Any exercise in past 30 days	6,240 (78%)	3,146 (80%)	3,094 (76%)	<0.001
Age group				<0.001
18-24	406 (5.1%)	235 (6.0%)	171 (4.2%)
25-29	408 (5.1%)	219 (5.6%)	189 (4.7%)
30-34	463 (5.8%)	253 (6.4%)	210 (5.2%)
35-39	565 (7.1%)	263 (6.7%)	302 (7.4%)
40-44	582 (7.3%)	290 (7.4%)	292 (7.2%)
45-49	518 (6.5%)	266 (6.8%)	252 (6.2%)
50-54	608 (7.6%)	305 (7.7%)	303 (7.5%)
55-59	660 (8.3%)	308 (7.8%)	352 (8.7%)
60-64	787 (9.8%)	408 (10%)	379 (9.3%)
65-69	901 (11%)	418 (11%)	483 (12%)
70-74	808 (10%)	382 (9.7%)	426 (10%)
75-79	663 (8.3%)	325 (8.3%)	338 (8.3%)
80+	631 (7.9%)	264 (6.7%)	367 (9.0%)
Annual household income				<0.001
Less than $15,000	407 (5.1%)	160 (4.1%)	247 (6.1%)
$15,000-$24,999	641 (8.0%)	271 (6.9%)	370 (9.1%)
$25,000-$34,999	871 (11%)	376 (9.6%)	495 (12%)
$35,000-$49,999	1,067 (13%)	482 (12%)	585 (14%)
$50,000-$99,999	2,511 (31%)	1,251 (32%)	1,260 (31%)
$100,000-$199,999	1,865 (23%)	996 (25%)	869 (21%)
$200,000 or more	638 (8.0%)	400 (10%)	238 (5.9%)
Education level				0.003
Less than high school	124 (1.6%)	75 (1.9%)	49 (1.2%)
High school diploma or GED	252 (3.2%)	130 (3.3%)	122 (3.0%)
Some college or technical school	1,827 (23%)	950 (24%)	877 (22%)
College graduate	2,138 (27%)	1,018 (26%)	1,120 (28%)
Graduate or professional degree	3,659 (46%)	1,763 (45%)	1,896 (47%)
Smoking status				<0.001
Current daily smoker	658 (8.2%)	339 (8.6%)	319 (7.8%)
Current some-day smoker	268 (3.4%)	151 (3.8%)	117 (2.9%)
Former smoker	2,262 (28%)	1,207 (31%)	1,055 (26%)
Never smoker	4,812 (60%)	2,239 (57%)	2,573 (63%)
1Mean (SD); n (%)
2Wilcoxon rank sum test; Pearson's Chi-squared test

Part 1: Multiple Linear Regression (25 Points)

Step 1: Fit the Full Multiple Linear Regression Model

mod_full <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + exercise +
    age_group + income + education + smoking,
  data = brfss_analytic
)

summary(mod_full)

## 
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + exercise + 
##     age_group + income + education + smoking, data = brfss_analytic)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.080  -3.865  -1.597   0.712  30.471 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                                9.65053    0.95407  10.115  < 2e-16
## physhlth_days                              0.26558    0.01007  26.384  < 2e-16
## bmi                                        0.03338    0.01321   2.527 0.011510
## sexFemale                                  1.39038    0.16750   8.301  < 2e-16
## exerciseYes                               -0.65116    0.21472  -3.033 0.002432
## age_group25-29                            -1.05660    0.51938  -2.034 0.041950
## age_group30-34                            -1.09395    0.50646  -2.160 0.030803
## age_group35-39                            -1.81103    0.48851  -3.707 0.000211
## age_group40-44                            -2.89307    0.48749  -5.935 3.07e-09
## age_group45-49                            -3.05618    0.49769  -6.141 8.61e-10
## age_group50-54                            -3.51674    0.48323  -7.277 3.72e-13
## age_group55-59                            -4.49597    0.47555  -9.454  < 2e-16
## age_group60-64                            -4.52215    0.45848  -9.863  < 2e-16
## age_group65-69                            -5.57795    0.45019 -12.390  < 2e-16
## age_group70-74                            -6.02536    0.45741 -13.173  < 2e-16
## age_group75-79                            -6.28656    0.47484 -13.239  < 2e-16
## age_group80+                              -6.81968    0.47684 -14.302  < 2e-16
## income$15,000-$24,999                     -1.63703    0.46899  -3.491 0.000485
## income$25,000-$34,999                     -2.07637    0.44797  -4.635 3.63e-06
## income$35,000-$49,999                     -2.54685    0.43819  -5.812 6.40e-09
## income$50,000-$99,999                     -3.05048    0.41069  -7.428 1.22e-13
## income$100,000-$199,999                   -3.49984    0.42923  -8.154 4.07e-16
## income$200,000 or more                    -3.78409    0.50036  -7.563 4.38e-14
## educationHigh school diploma or GED        0.07991    0.81066   0.099 0.921484
## educationSome college or technical school  1.07679    0.68973   1.561 0.118520
## educationCollege graduate                  1.79091    0.69119   2.591 0.009585
## educationGraduate or professional degree   1.73781    0.69250   2.509 0.012111
## smokingCurrent some-day smoker            -1.58670    0.53523  -2.965 0.003041
## smokingFormer smoker                      -1.98971    0.33713  -5.902 3.74e-09
## smokingNever smoker                       -2.93681    0.32162  -9.131  < 2e-16
##                                              
## (Intercept)                               ***
## physhlth_days                             ***
## bmi                                       *  
## sexFemale                                 ***
## exerciseYes                               ** 
## age_group25-29                            *  
## age_group30-34                            *  
## age_group35-39                            ***
## age_group40-44                            ***
## age_group45-49                            ***
## age_group50-54                            ***
## age_group55-59                            ***
## age_group60-64                            ***
## age_group65-69                            ***
## age_group70-74                            ***
## age_group75-79                            ***
## age_group80+                              ***
## income$15,000-$24,999                     ***
## income$25,000-$34,999                     ***
## income$35,000-$49,999                     ***
## income$50,000-$99,999                     ***
## income$100,000-$199,999                   ***
## income$200,000 or more                    ***
## educationHigh school diploma or GED          
## educationSome college or technical school    
## educationCollege graduate                 ** 
## educationGraduate or professional degree  *  
## smokingCurrent some-day smoker            ** 
## smokingFormer smoker                      ***
## smokingNever smoker                       ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.352 on 7970 degrees of freedom
## Multiple R-squared:  0.1853, Adjusted R-squared:  0.1824 
## F-statistic: 62.52 on 29 and 7970 DF,  p-value: < 2.2e-16

Step 2: Coefficients Table

tidy(mod_full, conf.int = TRUE) |>
  mutate(across(where(is.numeric), ~ round(., 4))) |>
  kable(
    caption   = "Table 1.1. Full Model Coefficient Estimates with 95% Confidence Intervals",
    col.names = c("Term","Estimate","SE","t-statistic","p-value","95% CI Lower","95% CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) |>
  scroll_box(height = "420px")

Table 1.1. Full Model Coefficient Estimates with 95% Confidence Intervals
Term	Estimate	SE	t-statistic	p-value	95% CI Lower	95% CI Upper
(Intercept)	9.6505	0.9541	10.1151	0.0000	7.7803	11.5208
physhlth_days	0.2656	0.0101	26.3841	0.0000	0.2459	0.2853
bmi	0.0334	0.0132	2.5274	0.0115	0.0075	0.0593
sexFemale	1.3904	0.1675	8.3007	0.0000	1.0620	1.7187
exerciseYes	-0.6512	0.2147	-3.0326	0.0024	-1.0721	-0.2303
age_group25-29	-1.0566	0.5194	-2.0343	0.0420	-2.0747	-0.0385
age_group30-34	-1.0939	0.5065	-2.1600	0.0308	-2.0867	-0.1012
age_group35-39	-1.8110	0.4885	-3.7072	0.0002	-2.7686	-0.8534
age_group40-44	-2.8931	0.4875	-5.9346	0.0000	-3.8487	-1.9375
age_group45-49	-3.0562	0.4977	-6.1408	0.0000	-4.0318	-2.0806
age_group50-54	-3.5167	0.4832	-7.2775	0.0000	-4.4640	-2.5695
age_group55-59	-4.4960	0.4755	-9.4543	0.0000	-5.4282	-3.5638
age_group60-64	-4.5221	0.4585	-9.8633	0.0000	-5.4209	-3.6234
age_group65-69	-5.5779	0.4502	-12.3903	0.0000	-6.4604	-4.6955
age_group70-74	-6.0254	0.4574	-13.1728	0.0000	-6.9220	-5.1287
age_group75-79	-6.2866	0.4748	-13.2392	0.0000	-7.2174	-5.3557
age_group80+	-6.8197	0.4768	-14.3019	0.0000	-7.7544	-5.8850
income$15,000-$24,999	-1.6370	0.4690	-3.4905	0.0005	-2.5564	-0.7177
income$25,000-$34,999	-2.0764	0.4480	-4.6351	0.0000	-2.9545	-1.1982
income$35,000-$49,999	-2.5469	0.4382	-5.8122	0.0000	-3.4058	-1.6879
income$50,000-$99,999	-3.0505	0.4107	-7.4277	0.0000	-3.8555	-2.2454
income$100,000-$199,999	-3.4998	0.4292	-8.1537	0.0000	-4.3413	-2.6584
income$200,000 or more	-3.7841	0.5004	-7.5628	0.0000	-4.7649	-2.8033
educationHigh school diploma or GED	0.0799	0.8107	0.0986	0.9215	-1.5092	1.6690
educationSome college or technical school	1.0768	0.6897	1.5612	0.1185	-0.2753	2.4288
educationCollege graduate	1.7909	0.6912	2.5911	0.0096	0.4360	3.1458
educationGraduate or professional degree	1.7378	0.6925	2.5095	0.0121	0.3803	3.0953
smokingCurrent some-day smoker	-1.5867	0.5352	-2.9645	0.0030	-2.6359	-0.5375
smokingFormer smoker	-1.9897	0.3371	-5.9020	0.0000	-2.6506	-1.3289
smokingNever smoker	-2.9368	0.3216	-9.1313	0.0000	-3.5673	-2.3063

Step 3: Fitted Regression Equation

coefs <- round(coef(mod_full), 3)
coefs

##                               (Intercept) 
##                                     9.651 
##                             physhlth_days 
##                                     0.266 
##                                       bmi 
##                                     0.033 
##                                 sexFemale 
##                                     1.390 
##                               exerciseYes 
##                                    -0.651 
##                            age_group25-29 
##                                    -1.057 
##                            age_group30-34 
##                                    -1.094 
##                            age_group35-39 
##                                    -1.811 
##                            age_group40-44 
##                                    -2.893 
##                            age_group45-49 
##                                    -3.056 
##                            age_group50-54 
##                                    -3.517 
##                            age_group55-59 
##                                    -4.496 
##                            age_group60-64 
##                                    -4.522 
##                            age_group65-69 
##                                    -5.578 
##                            age_group70-74 
##                                    -6.025 
##                            age_group75-79 
##                                    -6.287 
##                              age_group80+ 
##                                    -6.820 
##                     income$15,000-$24,999 
##                                    -1.637 
##                     income$25,000-$34,999 
##                                    -2.076 
##                     income$35,000-$49,999 
##                                    -2.547 
##                     income$50,000-$99,999 
##                                    -3.050 
##                   income$100,000-$199,999 
##                                    -3.500 
##                    income$200,000 or more 
##                                    -3.784 
##       educationHigh school diploma or GED 
##                                     0.080 
## educationSome college or technical school 
##                                     1.077 
##                 educationCollege graduate 
##                                     1.791 
##  educationGraduate or professional degree 
##                                     1.738 
##            smokingCurrent some-day smoker 
##                                    -1.587 
##                      smokingFormer smoker 
##                                    -1.990 
##                       smokingNever smoker 
##                                    -2.937

The fitted regression equation (coefficients rounded to three decimal places) is:

\[ \begin{aligned} \widehat{\text{MentalHealthDays}} =\ &9.651 \\ &+ 0.266 \cdot \text{PhysicalHealthDays} \\ &+ 0.033 \cdot \text{BMI} \\ &+ 1.39 \cdot \mathbb{1}[\text{Female}] \\ &+ -0.651 \cdot \mathbb{1}[\text{Exercise=Yes}] \\ &+ -1.057 \cdot \mathbb{1}[\text{25-29}] + -1.094 \cdot \mathbb{1}[\text{30-34}] + -1.811 \cdot \mathbb{1}[\text{35-39}] \\ &+ -2.893 \cdot \mathbb{1}[\text{40-44}] + -3.056 \cdot \mathbb{1}[\text{45-49}] + -3.517 \cdot \mathbb{1}[\text{50-54}] \\ &+ -4.496 \cdot \mathbb{1}[\text{55-59}] + -4.522 \cdot \mathbb{1}[\text{60-64}] + -5.578 \cdot \mathbb{1}[\text{65-69}] \\ &+ -6.025 \cdot \mathbb{1}[\text{70-74}] + -6.287 \cdot \mathbb{1}[\text{75-79}] + -6.82 \cdot \mathbb{1}[\text{80+}] \\ &+ -1.637 \cdot \mathbb{1}[\text{\$15k-\$25k}] + -2.076 \cdot \mathbb{1}[\text{\$25k-\$35k}] + -2.547 \cdot \mathbb{1}[\text{\$35k-\$50k}] \\ &+ -3.05 \cdot \mathbb{1}[\text{\$50k-\$100k}] + -3.5 \cdot \mathbb{1}[\text{\$100k-\$200k}] + -3.784 \cdot \mathbb{1}[\text{\$200k+}] \\ &+ 0.08 \cdot \mathbb{1}[\text{HS/GED}] + 1.077 \cdot \mathbb{1}[\text{Some college}] \\ &+ 1.791 \cdot \mathbb{1}[\text{College grad}] + 1.738 \cdot \mathbb{1}[\text{Grad degree}] \\ &+ -1.587 \cdot \mathbb{1}[\text{Some-day smoker}] + -1.99 \cdot \mathbb{1}[\text{Former smoker}] + -2.937 \cdot \mathbb{1}[\text{Never smoker}] \end{aligned} \]

Step 4: Coefficient Interpretations

physhlth_days (Continuous)

Each additional day of physical illness in the past 30 days is associated with an estimated0.266 additional mentally unhealthy day on average, holding all other predictors (BMI, sex, exercise, age group, income, education, and smoking) constant. This is the strongest continuous predictor in the model.

bmi (Continuous)

Each one-unit increase in BMI (kg/m²) is associated with an estimated change of 0.033 mentally unhealthy days, holding all other predictors constant. The small magnitude suggests a modest independent association after adjusting for the other variables.

sex: Female vs. Male (Reference)

Females report an estimated 1.39 more mentally unhealthy days on average compared to males (the reference group), holding all other variables constant.

exercise: Yes vs. No (Reference)

Adults who reported any exercise in the past 30 days report an estimated 0.651 fewer mentally unhealthy days compared to those reporting no exercise, holding all other variables constant.

Age Group: 25–29 vs. 18–24 (Reference)

Adults aged 25–29 report an estimated 1.057 fewer mentally unhealthy days compared to those aged 18–24 (the reference group), holding all other predictors constant.

Income: Two Coefficients vs. Less than $15,000 (Reference)

$50,000–$99,999 vs. Less than $15,000:

Adults earning $50,000–$99,999 annually report an estimated 3.05 fewer mentally unhealthy days compared to those earning less than $15,000, holding all other variables constant.

$200,000 or more vs. Less than $15,000:

Adults earning $200,000 or more report an estimated 3.784 fewer mentally unhealthy days compared to those earning less than $15,000, holding all other variables constant. This reflects the persistent socioeconomic gradient in mental health.

Step 5: Model-Level Statistics

mod_stats <- glance(mod_full)

tibble(
  Statistic = c(
    "R-squared",
    "Adjusted R-squared",
    "Root MSE (Residual Standard Error)",
    "Overall F-statistic",
    "Numerator df (p)",
    "Denominator df (n - p - 1)",
    "Overall p-value"
  ),
  Value = c(
    round(mod_stats$r.squared, 4),
    round(mod_stats$adj.r.squared, 4),
    round(mod_stats$sigma, 3),
    round(mod_stats$statistic, 2),
    mod_stats$df,
    mod_stats$df.residual,
    format.pval(mod_stats$p.value, digits = 3)
  )
) |>
  kable(caption = "Table 1.2. Model-Level Summary Statistics") |>
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)

Table 1.2. Model-Level Summary Statistics
Statistic	Value
R-squared	0.1853
Adjusted R-squared	0.1824
Root MSE (Residual Standard Error)	7.352
Overall F-statistic	62.52
Numerator df (p)	29
Denominator df (n - p - 1)	7970
Overall p-value	<2e-16

R-squared

$R^2$ =0.1853: approximately 18.5% of the total variability in mentally unhealthy days is explained by all predictors combined.

Adjusted R-squared

Adjusted $R^2$ =0.1824, slightly lower than $R^2$. Adjusted $R^2$ penalizes the model for each additional predictor added, correcting for the mechanical increase in $R^2$ that occurs even when predictors add no real information. The small gap confirms the predictors are genuinely contributing.

Root MSE

The residual standard error is 7.352 days — the typical absolute distance between an observed and a model-predicted value of mentally unhealthy days.

Overall F-test

\[H_0: \beta_1 = \beta_2 = \cdots = \beta_p = 0 \quad \text{(no predictor is associated with mentally unhealthy days)}\] \[H_A: \text{At least one } \beta_j \neq 0\]

$F(29, 7970) = 62.52$, $p < 0.001$. We reject $H_0$ and conclude that at least one predictor is significantly associated with mentally unhealthy days.

Part 2: Tests of Hypotheses (20 Points)

Step 1: Type III Partial Sums of Squares

anova_type3 <- Anova(mod_full, type = "III")

anova_type3 |>
  tidy() |>
  mutate(
    across(where(is.numeric), ~ round(., 4)),
    `Significant (alpha = 0.05)` = ifelse(p.value < 0.05, "Yes", "No")
  ) |>
  kable(
    caption   = "Table 2.1. Type III Partial F-Tests — All Predictors",
    col.names = c("Predictor","Sum of Sq","df","F value","p-value","Significant?")
  ) |>
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) |>
  column_spec(6, bold = TRUE)

Table 2.1. Type III Partial F-Tests — All Predictors
Predictor	Sum of Sq	df	F value	p-value	Significant?
(Intercept)	5529.7722	1	102.3152	0.0000	Yes
physhlth_days	37622.7952	1	696.1198	0.0000	Yes
bmi	345.2408	1	6.3879	0.0115	Yes
sex	3723.8662	1	68.9012	0.0000	Yes
exercise	497.0434	1	9.1966	0.0024	Yes
age_group	30092.1774	12	46.3986	0.0000	Yes
income	4733.8943	6	14.5982	0.0000	Yes
education	1265.1504	4	5.8521	0.0001	Yes
smoking	5101.1076	3	31.4613	0.0000	Yes
Residuals	430750.0872	7970	NA	NA	NA

Predictors marked Yes make a statistically significant independent contribution to predicting mentally unhealthy days, after adjusting for all other predictors simultaneously.

Step 2: Chunk Test — Does Income Collectively Improve the Model?

mod_no_income <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + exercise +
    age_group + education + smoking,
  data = brfss_analytic
)

ft_income <- anova(mod_no_income, mod_full)

ft_income |>
  tidy() |>
  mutate(across(where(is.numeric), ~ round(., 4))) |>
  kable(
    caption   = "Table 2.2. Chunk Test: Does Income as a Group Improve the Model?",
    col.names = c("Model","Res. df","RSS","df","Sum of Sq","F","p-value")
  ) |>
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)

Table 2.2. Chunk Test: Does Income as a Group Improve the Model?
Model	Res. df	RSS	df	Sum of Sq	F	p-value
menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + education + smoking	7976	435484.0	NA	NA	NA	NA
menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + income + education + smoking	7970	430750.1	6	4733.894	14.5982	0

f_inc   <- round(ft_income$F[2], 3)
p_inc   <- ft_income$`Pr(>F)`[2]
df1_inc <- ft_income$Df[2]
df2_inc <- ft_income$Res.Df[2]

Hypotheses:

\[H_0: \beta_{\$15k-\$25k} = \beta_{\$25k-\$35k} = \beta_{\$35k-\$50k} = \beta_{\$50k-\$100k} = \beta_{\$100k-\$200k} = \beta_{\$200k+} = 0\] \[H_A: \text{At least one income coefficient} \neq 0\]

$F(6, 7970) = 14.598$, $p < 0.001$. We reject $H_0$ at $\alpha = 0.05$. Income as a group adds statistically significant explanatory value for mentally unhealthy days beyond all other predictors.

Step 3: Chunk Test — Does Education Collectively Improve the Model?

mod_no_educ <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + exercise +
    age_group + income + smoking,
  data = brfss_analytic
)

ft_educ <- anova(mod_no_educ, mod_full)

ft_educ |>
  tidy() |>
  mutate(across(where(is.numeric), ~ round(., 4))) |>
  kable(
    caption   = "Table 2.3. Chunk Test: Does Education as a Group Improve the Model?",
    col.names = c("Model","Res. df","RSS","df","Sum of Sq","F","p-value")
  ) |>
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)

Table 2.3. Chunk Test: Does Education as a Group Improve the Model?
Model	Res. df	RSS	df	Sum of Sq	F	p-value
menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + income + smoking	7974	432015.2	NA	NA	NA	NA
menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + income + education + smoking	7970	430750.1	4	1265.15	5.8521	1e-04

f_educ   <- round(ft_educ$F[2], 3)
p_educ   <- ft_educ$`Pr(>F)`[2]
df1_educ <- ft_educ$Df[2]
df2_educ <- ft_educ$Res.Df[2]

Hypotheses:

\[H_0: \beta_{\text{HS/GED}} = \beta_{\text{Some college}} = \beta_{\text{College grad}} = \beta_{\text{Grad degree}} = 0\] \[H_A: \text{At least one education coefficient} \neq 0\]

$F(4, 7970) = 5.852$, $p < 0.001$. We reject $H_0$ at $\alpha = 0.05$. Education as a group adds statistically significant explanatory value for mentally unhealthy days beyond all other predictors.

Step 4: Written Synthesis

The Type III partial F-tests confirm that physical health days, sex, exercise, age group, income, education, and smoking each make statistically significant independent contributions to predicting mentally unhealthy days after adjusting for all other predictors simultaneously. Physical health days and smoking status were among the most substantively notable predictors. The chunk tests for income and education extend the individual t-test results in an important way: even when individual dummy coefficients within a multi-level categorical variable do not all reach significance on their own which can happen when adjacent categories are closely spaced the variable as a whole may still explain meaningful variation. Testing income and education collectively with anova() confirms both variables contribute significant group-level predictive value, providing stronger justification for their inclusion than relying on any single dummy coefficient alone.

Part 3: Interaction Analysis (25 Points)

Step 1: Create Binary Smoking Variable

brfss_analytic <- brfss_analytic |>
  mutate(
    smoker_current = factor(
      case_when(
        smoking %in% c("Current daily smoker",
                        "Current some-day smoker") ~ "Current smoker",
        smoking %in% c("Former smoker",
                        "Never smoker")            ~ "Non-smoker",
        TRUE                                       ~ NA_character_
      ),
      levels = c("Non-smoker", "Current smoker")   # Non-smoker is reference
    )
  )

table(brfss_analytic$smoker_current, useNA = "always")

## 
##     Non-smoker Current smoker           <NA> 
##           7074            926              0

Step 2: Fit Model A (Main Effects) and Model B (With Interaction)

mod_A <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + smoker_current +
    exercise + age_group + income + education,
  data = brfss_analytic
)

tidy(mod_A, conf.int = TRUE) |>
  mutate(across(where(is.numeric), ~ round(., 4))) |>
  kable(
    caption   = "Table 3.1. Model A — Main Effects Only",
    col.names = c("Term","Estimate","SE","t","p-value","95% CI Lower","95% CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) |>
  scroll_box(height = "360px")

Table 3.1. Model A — Main Effects Only
Term	Estimate	SE	t	p-value	95% CI Lower	95% CI Upper
(Intercept)	6.7553	0.9271	7.2869	0.0000	4.9381	8.5726
physhlth_days	0.2686	0.0101	26.6731	0.0000	0.2489	0.2884
bmi	0.0334	0.0132	2.5250	0.0116	0.0075	0.0593
sexFemale	1.3331	0.1673	7.9675	0.0000	1.0051	1.6611
smoker_currentCurrent smoker	2.1287	0.2712	7.8489	0.0000	1.5971	2.6604
exerciseYes	-0.6725	0.2150	-3.1274	0.0018	-1.0940	-0.2510
age_group25-29	-0.9149	0.5198	-1.7602	0.0784	-1.9338	0.1040
age_group30-34	-0.8823	0.5061	-1.7434	0.0813	-1.8743	0.1097
age_group35-39	-1.5810	0.4877	-3.2421	0.0012	-2.5369	-0.6251
age_group40-44	-2.6157	0.4858	-5.3847	0.0000	-3.5680	-1.6635
age_group45-49	-2.8246	0.4970	-5.6836	0.0000	-3.7988	-1.8504
age_group50-54	-3.2600	0.4821	-6.7628	0.0000	-4.2050	-2.3151
age_group55-59	-4.2301	0.4741	-8.9219	0.0000	-5.1595	-3.3007
age_group60-64	-4.2484	0.4568	-9.3000	0.0000	-5.1439	-3.3529
age_group65-69	-5.2338	0.4467	-11.7163	0.0000	-6.1095	-4.3582
age_group70-74	-5.7023	0.4545	-12.5457	0.0000	-6.5933	-4.8113
age_group75-79	-5.8977	0.4703	-12.5401	0.0000	-6.8197	-4.9758
age_group80+	-6.4888	0.4737	-13.6975	0.0000	-7.4174	-5.5602
income$15,000-$24,999	-1.6797	0.4698	-3.5754	0.0004	-2.6007	-0.7588
income$25,000-$34,999	-2.1023	0.4486	-4.6861	0.0000	-2.9817	-1.2229
income$35,000-$49,999	-2.5869	0.4390	-5.8931	0.0000	-3.4474	-1.7264
income$50,000-$99,999	-3.0823	0.4114	-7.4921	0.0000	-3.8887	-2.2758
income$100,000-$199,999	-3.5360	0.4300	-8.2232	0.0000	-4.3789	-2.6931
income$200,000 or more	-3.8625	0.5011	-7.7078	0.0000	-4.8448	-2.8802
educationHigh school diploma or GED	0.2139	0.8119	0.2634	0.7922	-1.3776	1.8053
educationSome college or technical school	1.1965	0.6907	1.7323	0.0833	-0.1575	2.5505
educationCollege graduate	1.9035	0.6922	2.7499	0.0060	0.5466	3.2604
educationGraduate or professional degree	1.7456	0.6938	2.5160	0.0119	0.3856	3.1057

# The * operator includes sex, smoker_current, AND their interaction term
mod_B <- lm(
  menthlth_days ~ physhlth_days + bmi + sex * smoker_current +
    exercise + age_group + income + education,
  data = brfss_analytic
)

tidy(mod_B, conf.int = TRUE) |>
  mutate(across(where(is.numeric), ~ round(., 4))) |>
  kable(
    caption   = "Table 3.2. Model B — With sex x smoker_current Interaction",
    col.names = c("Term","Estimate","SE","t","p-value","95% CI Lower","95% CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) |>
  scroll_box(height = "400px")

Table 3.2. Model B — With sex x smoker_current Interaction
Term	Estimate	SE	t	p-value	95% CI Lower	95% CI Upper
(Intercept)	6.8994	0.9286	7.4301	0.0000	5.0791	8.7196
physhlth_days	0.2686	0.0101	26.6788	0.0000	0.2489	0.2883
bmi	0.0331	0.0132	2.5017	0.0124	0.0072	0.0590
sexFemale	1.1855	0.1775	6.6784	0.0000	0.8376	1.5335
smoker_currentCurrent smoker	1.5208	0.3654	4.1621	0.0000	0.8045	2.2371
exerciseYes	-0.6728	0.2150	-3.1301	0.0018	-1.0942	-0.2515
age_group25-29	-0.9202	0.5196	-1.7709	0.0766	-1.9388	0.0984
age_group30-34	-0.8924	0.5059	-1.7640	0.0778	-1.8841	0.0993
age_group35-39	-1.5929	0.4875	-3.2673	0.0011	-2.5485	-0.6372
age_group40-44	-2.6286	0.4856	-5.4127	0.0000	-3.5806	-1.6766
age_group45-49	-2.8425	0.4969	-5.7209	0.0000	-3.8165	-1.8686
age_group50-54	-3.2778	0.4820	-6.8012	0.0000	-4.2226	-2.3331
age_group55-59	-4.2499	0.4740	-8.9652	0.0000	-5.1791	-3.3206
age_group60-64	-4.2640	0.4567	-9.3364	0.0000	-5.1593	-3.3688
age_group65-69	-5.2506	0.4466	-11.7563	0.0000	-6.1261	-4.3751
age_group70-74	-5.7111	0.4544	-12.5686	0.0000	-6.6018	-4.8203
age_group75-79	-5.9076	0.4702	-12.5646	0.0000	-6.8292	-4.9859
age_group80+	-6.4995	0.4736	-13.7239	0.0000	-7.4278	-5.5711
income$15,000-$24,999	-1.6357	0.4700	-3.4804	0.0005	-2.5570	-0.7144
income$25,000-$34,999	-2.0746	0.4486	-4.6243	0.0000	-2.9540	-1.1952
income$35,000-$49,999	-2.5455	0.4392	-5.7964	0.0000	-3.4064	-1.6847
income$50,000-$99,999	-3.0430	0.4116	-7.3935	0.0000	-3.8498	-2.2362
income$100,000-$199,999	-3.5097	0.4300	-8.1623	0.0000	-4.3526	-2.6668
income$200,000 or more	-3.8405	0.5010	-7.6651	0.0000	-4.8226	-2.8583
educationHigh school diploma or GED	0.1256	0.8124	0.1546	0.8772	-1.4669	1.7180
educationSome college or technical school	1.1179	0.6912	1.6172	0.1059	-0.2371	2.4729
educationCollege graduate	1.8179	0.6928	2.6239	0.0087	0.4598	3.1760
educationGraduate or professional degree	1.6691	0.6943	2.4040	0.0162	0.3081	3.0300
sexFemale:smoker_currentCurrent smoker	1.2833	0.5171	2.4819	0.0131	0.2697	2.2968

Step 3: Test Whether the Interaction Is Statistically Significant

int_test <- anova(mod_A, mod_B)

int_test |>
  tidy() |>
  mutate(across(where(is.numeric), ~ round(., 4))) |>
  kable(
    caption   = "Table 3.3. Partial F-Test: Model A (no interaction) vs. Model B (with interaction)",
    col.names = c("Model","Res. df","RSS","df","Sum of Sq","F","p-value")
  ) |>
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)

Table 3.3. Partial F-Test: Model A (no interaction) vs. Model B (with interaction)
Model	Res. df	RSS	df	Sum of Sq	F	p-value
menthlth_days ~ physhlth_days + bmi + sex + smoker_current + exercise + age_group + income + education	7972	432508.9	NA	NA	NA	NA
menthlth_days ~ physhlth_days + bmi + sex * smoker_current + exercise + age_group + income + education	7971	432174.9	1	333.9749	6.1598	0.0131

f_int   <- round(int_test$F[2], 3)
p_int   <- int_test$`Pr(>F)`[2]
df1_int <- int_test$Df[2]
df2_int <- int_test$Res.Df[2]

Hypotheses:

\[H_0: \beta_{\text{Female} \times \text{Current smoker}} = 0 \quad \text{(smoking–mental health association is the same for men and women)}\]

\[H_A: \beta_{\text{Female} \times \text{Current smoker}} \neq 0 \quad \text{(the association differs by sex)}\]

$F(1, 7971) = 6.16$, $p = 0.0131$. We reject $H_0$ at $\alpha = 0.05$ and conclude that the association between current smoking and mentally unhealthy days differs significantly by sex.

Step 4: Interpret the Interaction Coefficients

b_B <- round(coef(mod_B), 3)

smoking_men   <- b_B["smoker_currentCurrent smoker"]
smoking_women <- b_B["smoker_currentCurrent smoker"] +
                 b_B["sexFemale:smoker_currentCurrent smoker"]

cat("Estimated association of current smoking with mentally unhealthy days:\n")

## Estimated association of current smoking with mentally unhealthy days:

cat("  Among men   :", round(smoking_men,   3), "days\n")

##   Among men   : 1.521 days

cat("  Among women :", round(smoking_women, 3), "days\n")

##   Among women : 2.804 days

cat("  Difference  :", round(smoking_women - smoking_men, 3), "days\n")

##   Difference  : 1.283 days

Among men (reference sex group): Male current smokers report an estimated 1.521 days more mentally unhealthy days compared to male non-smokers, holding all other predictors constant.

Among women: The estimated association of current smoking for women = 1.521 + 1.283 = 2.804 days. Female current smokers report an estimated 2.804 more mentally unhealthy days compared to female non-smokers, holding all other variables constant.

Together: The difference in the smoking effect between women and men is 1.283 days. Current smoking is more strongly associated with mentally unhealthy days among women than among men.

Step 5: Visualize the Interaction

pred_int <- ggpredict(mod_B, terms = c("smoker_current", "sex"))

# Convert to data frame for full control over the plot
pred_df <- as.data.frame(pred_int)

ggplot(pred_df, aes(x = x, y = predicted, color = group, group = group)) +
  geom_point(size = 4, position = position_dodge(width = 0.3)) +
  geom_line(linewidth = 1.2, position = position_dodge(width = 0.3)) +
  geom_errorbar(
    aes(ymin = conf.low, ymax = conf.high),
    width    = 0.15,
    linewidth = 0.8,
    position = position_dodge(width = 0.3)
  ) +
  scale_color_brewer(palette = "Set1", name = "Sex") +
  labs(
    title    = "Predicted Mentally Unhealthy Days by Smoking Status and Sex",
    subtitle = "Model B: sex x smoker_current interaction, adjusted for all other predictors",
    x        = "Smoking Status",
    y        = "Predicted Mentally Unhealthy Days (Past 30)",
    caption  = "Points = model-adjusted predictions; error bars = 95% confidence intervals."
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "right")

Figure 3.1. Predicted Mentally Unhealthy Days by Smoking Status and Sex (Model B)

Step 6: Public Health Interpretation

Among U.S. adults, current smokers report more mentally unhealthy days on average than former or never smokers, even after accounting for age, sex, income, education, physical health, and exercise. The strength of this association appears to differ between men and women, with women who smoke experiencing a larger gap in mental health burden compared to non-smoking women, relative to the corresponding gap among men. These findings suggest that public health programs addressing both tobacco use and mental health may benefit from sex-specific approaches. Because this analysis is cross-sectional, we cannot determine the direction of causality — poor mental health may increase the likelihood of smoking, smoking may worsen mental health, or both may reflect shared upstream causes.

Part 4: Reflection (15 Points)

Reflection

The results suggest that mentally unhealthy days among U.S. adults are associated with a wide range of individual, behavioral, and socioeconomic factors. Physical health days emerged as the strongest predictor, consistent with the extensive literature documenting the bidirectional relationship between physical illness and psychological distress. Current smokers — particularly daily smokers — reported substantially more mentally unhealthy days than former or never smokers even after controlling for other variables, and the interaction analysis indicates this association may differ by sex. Income and education both showed clear inverse gradients, with lower socioeconomic status linked to higher mental health burden, reinforcing the central role of social determinants in population mental health. Predictors with weaker associations such as BMI may operate through intermediate pathways not captured in this cross-sectional model. The primary limitation is the study’s cross-sectional design: because exposure and outcome are measured simultaneously, we cannot establish temporal precedence or rule out reverse causality — for example, we cannot determine whether smoking worsens mental health or whether psychological distress drives smoking initiation. The BRFSS also relies on self-reported telephone interviews, which are subject to recall bias and social desirability bias for sensitive outcomes. Two specific confounders likely to bias the estimated associations include social support networks, individuals with weaker social ties are simultaneously more likely to smoke and to experience psychological distress — and pre-existing mental health diagnoses, people with diagnosed depression or anxiety have markedly elevated smoking rates, which would inflate the observed smoking–mental health association beyond any causal pathway.

Adjusted $R^2$ offers an important correction that raw $R^2$ does not: because $R^2$ can only increase as predictors are added, including even uninformative variables will artificially improve apparent fit. In a model with many categorical predictors — each generating multiple dummy variables — this inflation can be substantial. Adjusted $R^2$ penalizes each additional degree of freedom consumed, providing a more honest assessment of explanatory power. The small gap between $R^2$ (0.1853) and Adjusted $R^2$ (0.1824) confirms the predictors are genuinely informative rather than noise. Chunk tests address a complementary limitation of individual t-tests: when a categorical predictor is represented by multiple dummy variables, individual coefficients may not reach significance even when the variable as a whole is strongly predictive, because each comparison targets only a narrow contrast against the reference group. Testing income and education collectively using anova() asks the more appropriate question — does the variable as a whole explain meaningful variation? — and both passed this test decisively.

AI Transparency:

I used AI assistance (Claude) for replacing deprecated geom_errorbarh() with geom_errorbar(orientation = "y"). I struggled trying to get the plot. Also when I completed the work I had an error when trying to knit so I checked out and realized there was a chuck to be removed which was done on chatgpt with the prompt I had.

EPI 553 — Homework 3: Multiple Linear Regression, Tests of Hypotheses, and Interaction Analysis

EPI 553: Principles of Statistical Inference II | Spring 2026

Emmanuel Nana Arko

04/05/2026

Part 0: Data Acquisition and Preparation (15 Points)

Step 1: Load Required Packages

Step 2: Import the Raw BRFSS 2023 Data

Step 3: Select and Recode All Variables

Step 4: Load the Pre-Saved Clean Dataset

Step 5: Report Missingness Before Exclusions

Step 6: Create the Analytic Dataset

Step 7: Descriptive Statistics Table Stratified by Sex

Part 1: Multiple Linear Regression (25 Points)

Step 1: Fit the Full Multiple Linear Regression Model

Step 2: Coefficients Table

Step 3: Fitted Regression Equation

Step 5: Model-Level Statistics

Part 2: Tests of Hypotheses (20 Points)

Step 1: Type III Partial Sums of Squares

Step 2: Chunk Test — Does Income Collectively Improve the Model?

Hypotheses:

Step 3: Chunk Test — Does Education Collectively Improve the Model?

Step 4: Written Synthesis

Part 3: Interaction Analysis (25 Points)

Step 1: Create Binary Smoking Variable

Step 2: Fit Model A (Main Effects) and Model B (With Interaction)

Step 3: Test Whether the Interaction Is Statistically Significant

Hypotheses:

Step 4: Interpret the Interaction Coefficients

Step 5: Visualize the Interaction

Step 6: Public Health Interpretation

Part 4: Reflection (15 Points)

Reflection

AI Transparency: