# Load the dataset
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tibble' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## Warning: package 'stringr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
## Warning: package 'broom' was built under R version 4.4.3
library(knitr)
## Warning: package 'knitr' was built under R version 4.4.3
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.4.3
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(gtsummary)
## Warning: package 'gtsummary' was built under R version 4.4.3
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(ggeffects)
## Warning: package 'ggeffects' was built under R version 4.4.3
library(haven)
## Warning: package 'haven' was built under R version 4.4.3
brfss_raw <- read_xpt("/Users/sarah/OneDrive/Documents/EPI 553/data/LLCP2023.XPT")
nrow(brfss_raw)
## [1] 433323
ncol(brfss_raw)
## [1] 350
#selecting required variables
brfss_raw <- brfss_raw |>
  select(MENTHLTH,
    PHYSHLTH,
    `_BMI5`,
    SEXVAR,
    EXERANY2,
    `_AGEG5YR`,
    `_INCOMG1`,
    EDUCA,
    `_SMOKER3`)

There are 433,323 rows and 350 columns in the raw dataset.

brfss_clean <- brfss_raw |>
  mutate(
    #Mentally unhealthy days
    menthlth_days = case_when(
      MENTHLTH == 88                    ~ 0,
      MENTHLTH >= 1 & MENTHLTH <= 30   ~ as.numeric(MENTHLTH),
      TRUE                             ~ NA_real_
    ),
    #Physically unhealthy days
    physhlth_days = case_when(
      PHYSHLTH == 88                    ~ 0,
      PHYSHLTH >= 1 & PHYSHLTH <= 30   ~ as.numeric(PHYSHLTH),
      TRUE                             ~ NA_real_
    ),
    #BMI
    bmi = ifelse(`_BMI5` == 9999, NA_real_, `_BMI5` /100),
    #Sex
   sex = factor(SEXVAR, levels = c(1, 2), labels = c("Male", "Female")),
   #Exercise
   exercise = case_when(
     EXERANY2 == 1 ~ "Yes",
     EXERANY2 == 2 ~ "No",
     EXERANY2 %in% c(7,9) ~ NA_character_),
   #Age group
   age_group = factor(
     ifelse(`_AGEG5YR` == 14, NA, `_AGEG5YR`),
     levels = 1:13,
     labels = c("18-24","25-29","30-34","35-39","40-44","45-49",
    "50-54","55-59","60-64","65-69","70-74","75-79","80+")
    ),
     #Income
     income = factor(
       ifelse(`_INCOMG1` == 9, NA, `_INCOMG1`),
       levels = 1:7,
       labels = c("Less than $15,000",
        "$15,000–$24,999",
        "$25,000–$34,999",
        "$35,000–$49,999",
        "$50,000–$74,999",
        "$75,000–$199,999",
        "$200,000 or more")

     ),
     #Education
     education= case_when(EDUCA %in% 1:2 ~ "Less than high school",
      EDUCA == 3 ~ "High school diploma or GED",
      EDUCA == 4 ~ "Some college or technical school",
      EDUCA == 5 ~ "College graduate",
      EDUCA == 6 ~ "Graduate or professional degree",
      EDUCA == 9 ~ NA_character_
    ),
     #Smoking
     smoking = factor(
       ifelse(`_SMOKER3` == 9, NA, `_SMOKER3`),
      levels = 1:4,
      labels = c(
        "Current daily smoker",
        "Current some-day smoker",
        "Former smoker",
        "Never smoker"
     )
    )
)

#Missingness 
brfss_clean |>
  summarise(
    n_missing_menthlth = sum(is.na(menthlth_days)),
    pct_missing_menthlth = mean(is.na(menthlth_days)) * 100,
    n_missing_physhlth = sum(is.na(physhlth_days)),
    pct_missing_physhlth = mean(is.na(physhlth_days)) * 100,
    n_missing_bmi = sum(is.na(bmi)),
    pct_missing_bmi = mean(is.na(bmi)) * 100
  ) |>
glimpse()
## Rows: 1
## Columns: 6
## $ n_missing_menthlth   <int> 8108
## $ pct_missing_menthlth <dbl> 1.871122
## $ n_missing_physhlth   <int> 10785
## $ pct_missing_physhlth <dbl> 2.488906
## $ n_missing_bmi        <int> 40535
## $ pct_missing_bmi      <dbl> 9.354454

Before removing any missing values, 8,108 observations (1.87%) were missing menthlth_days, and 10,785 observations (2.49%) were missing physhlth_days. Missingness for BMI was higher, with 40,535 observations (9.35%) missing.

set.seed(1220)
brfss_analytic <- brfss_clean |>
  drop_na(menthlth_days, physhlth_days, bmi, sex, exercise, age_group, income, education, smoking) |>
  slice_sample(n=8000)

saveRDS(brfss_clean, 
"brfss_hw3_clean.rds")
brfss_analytic |>
  select(sex, menthlth_days, physhlth_days, bmi, exercise, age_group, income, education, smoking) |>
  tbl_summary(
    by = sex,
    statistic = list(
      all_continuous() ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    )
    )|>
  add_n() |>
  bold_labels() |>
  modify_caption("**Table 1. Descriptive Statistics**") |>
  as_flex_table()
**Table 1. Descriptive Statistics**

Characteristic

N

Male
N = 3,9361

Female
N = 4,0641

menthlth_days

8,000

4 (8)

5 (9)

physhlth_days

8,000

4 (8)

5 (9)

bmi

8,000

29 (6)

29 (7)

exercise

8,000

3,146 (80%)

3,094 (76%)

age_group

8,000

18-24

235 (6.0%)

171 (4.2%)

25-29

219 (5.6%)

189 (4.7%)

30-34

253 (6.4%)

210 (5.2%)

35-39

263 (6.7%)

302 (7.4%)

40-44

290 (7.4%)

292 (7.2%)

45-49

266 (6.8%)

252 (6.2%)

50-54

305 (7.7%)

303 (7.5%)

55-59

308 (7.8%)

352 (8.7%)

60-64

408 (10%)

379 (9.3%)

65-69

418 (11%)

483 (12%)

70-74

382 (9.7%)

426 (10%)

75-79

325 (8.3%)

338 (8.3%)

80+

264 (6.7%)

367 (9.0%)

income

8,000

Less than $15,000

160 (4.1%)

247 (6.1%)

$15,000–$24,999

271 (6.9%)

370 (9.1%)

$25,000–$34,999

376 (9.6%)

495 (12%)

$35,000–$49,999

482 (12%)

585 (14%)

$50,000–$74,999

1,251 (32%)

1,260 (31%)

$75,000–$199,999

996 (25%)

869 (21%)

$200,000 or more

400 (10%)

238 (5.9%)

education

8,000

College graduate

1,018 (26%)

1,120 (28%)

Graduate or professional degree

1,763 (45%)

1,896 (47%)

High school diploma or GED

130 (3.3%)

122 (3.0%)

Less than high school

75 (1.9%)

49 (1.2%)

Some college or technical school

950 (24%)

877 (22%)

smoking

8,000

Current daily smoker

339 (8.6%)

319 (7.8%)

Current some-day smoker

151 (3.8%)

117 (2.9%)

Former smoker

1,207 (31%)

1,055 (26%)

Never smoker

2,239 (57%)

2,573 (63%)

1Mean (SD); n (%)

brfss_mlr <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + income + education + smoking, data = brfss_analytic)

summary(brfss_mlr)
## 
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + exercise + 
##     age_group + income + education + smoking, data = brfss_analytic)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.080  -3.865  -1.597   0.712  30.471 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                               11.44144    0.73022  15.669  < 2e-16
## physhlth_days                              0.26558    0.01007  26.384  < 2e-16
## bmi                                        0.03338    0.01321   2.527 0.011510
## sexFemale                                  1.39038    0.16750   8.301  < 2e-16
## exerciseYes                               -0.65116    0.21472  -3.033 0.002432
## age_group25-29                            -1.05660    0.51938  -2.034 0.041950
## age_group30-34                            -1.09395    0.50646  -2.160 0.030803
## age_group35-39                            -1.81103    0.48851  -3.707 0.000211
## age_group40-44                            -2.89307    0.48749  -5.935 3.07e-09
## age_group45-49                            -3.05618    0.49769  -6.141 8.61e-10
## age_group50-54                            -3.51674    0.48323  -7.277 3.72e-13
## age_group55-59                            -4.49597    0.47555  -9.454  < 2e-16
## age_group60-64                            -4.52215    0.45848  -9.863  < 2e-16
## age_group65-69                            -5.57795    0.45019 -12.390  < 2e-16
## age_group70-74                            -6.02536    0.45741 -13.173  < 2e-16
## age_group75-79                            -6.28656    0.47484 -13.239  < 2e-16
## age_group80+                              -6.81968    0.47684 -14.302  < 2e-16
## income$15,000–$24,999                     -1.63703    0.46899  -3.491 0.000485
## income$25,000–$34,999                     -2.07637    0.44797  -4.635 3.63e-06
## income$35,000–$49,999                     -2.54685    0.43819  -5.812 6.40e-09
## income$50,000–$74,999                     -3.05048    0.41069  -7.428 1.22e-13
## income$75,000–$199,999                    -3.49984    0.42923  -8.154 4.07e-16
## income$200,000 or more                    -3.78409    0.50036  -7.563 4.38e-14
## educationGraduate or professional degree  -0.05310    0.21116  -0.251 0.801448
## educationHigh school diploma or GED       -1.71101    0.49969  -3.424 0.000620
## educationLess than high school            -1.79091    0.69119  -2.591 0.009585
## educationSome college or technical school -0.71412    0.23816  -2.998 0.002722
## smokingCurrent some-day smoker            -1.58670    0.53523  -2.965 0.003041
## smokingFormer smoker                      -1.98971    0.33713  -5.902 3.74e-09
## smokingNever smoker                       -2.93681    0.32162  -9.131  < 2e-16
##                                              
## (Intercept)                               ***
## physhlth_days                             ***
## bmi                                       *  
## sexFemale                                 ***
## exerciseYes                               ** 
## age_group25-29                            *  
## age_group30-34                            *  
## age_group35-39                            ***
## age_group40-44                            ***
## age_group45-49                            ***
## age_group50-54                            ***
## age_group55-59                            ***
## age_group60-64                            ***
## age_group65-69                            ***
## age_group70-74                            ***
## age_group75-79                            ***
## age_group80+                              ***
## income$15,000–$24,999                     ***
## income$25,000–$34,999                     ***
## income$35,000–$49,999                     ***
## income$50,000–$74,999                     ***
## income$75,000–$199,999                    ***
## income$200,000 or more                    ***
## educationGraduate or professional degree     
## educationHigh school diploma or GED       ***
## educationLess than high school            ** 
## educationSome college or technical school ** 
## smokingCurrent some-day smoker            ** 
## smokingFormer smoker                      ***
## smokingNever smoker                       ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.352 on 7970 degrees of freedom
## Multiple R-squared:  0.1853, Adjusted R-squared:  0.1824 
## F-statistic: 62.52 on 29 and 7970 DF,  p-value: < 2.2e-16

Step 2: Fitted Regression Equation: \[ \hat{\menthlth_days} = 11.441 + 0.266(physhlth_days) + 0.033(bmi) + 1.390(female) + -0.651(exereciseYes) + -1.057(Age 25-29) + -1.094(Age 30-34) + -1.811 (Age 35-39) + -2.893(Age 40-44) + -3.056(Age 45-49) + -3.517(Age 50-54) + -4.496(Age 55-59) + -4.522 (Age 60-64) + -5.578(Age 65-69) + -6.025(Age 70-74) + -6.820(age 80+) + -1.637 (Income 15k-<25k) + -2.076(Income 25k-<35k) + -2.547(Income 35k-<50k) + -3.050(Income 50k-<75k) + -3.500(Income 75k-<200k) + -3.784(Income 200k+) + -0.053(Education Graduate degree) + -1.711(High school diploma) + -1.791(Less than HS) + -0.714(Some college) + -1.587(Current smoker) + -1.990(Former smoker) + -2.937(smoking never) \]

Part 1 Step 3:

Physhlth_days (Estimate = 0.266) Holding all other variables constant, each additional physically unhealthy days is associated with 0.266 more mentally unhealthy days on average.

Bmi (Estimate = 0.033) For each one-unit increase in BMI, mentally unhealthy days increased by 0.033 days, adjusting for all predictors.

Sex (Estimate = 1.390) Females report 1.39 more mentally unhealthy days than males, holding all other variables constant.

Exercise (Estimate = -0.651) Individulas who report exercising have 0.65 fewer mentally unhealthy days per month compared to those who do not exercise, controlling all other variables.

Age_group 45-49 (Estimate= -3.056) Adults ages 45-49 report 3.056 fewer mentally unhealthy days per month compared to adults aged 18-24.

Income 35-<50k (Estimate = -2.547) Compared to adults earning less than 15k, those earning 35k- 49,999 report 2.55 fewer mentally unhealthy days.

Income 75-<200k (Estimate = -3.500) Compared with the lowest income group, adults earning 75k-199,999 report 3.50 fewer mentally unhealthy days, adjusting for all other variables.

Part 1 Step 4:

R-squared: 0.1853 About 18.5% of the variation in mentlaly unhealthy days is explained by the predictors in the model.

Adjusted R-squared: 0.1824 The adjusted r-squared is slightly lower because it penalizes the model for including many predictors. This provides a more conservative estimate of explained variance.

Root MSE:7.352 On average, the model’s predictions differ from observed mentally unhealthy days by about 7.35 days.

Overall F-test: H0: All regression coefficients = 0 F-statistic = 62.52 with 7970 degrees of freedom p-value: 2.2e-16 Conclusion: We reject the null hypothesis. The predictors explain significantly more variation in mentally unhealthy days than a model with no predictors.

#Type III sums of squares
anova_type3 <- Anova(brfss_mlr, type = "III")

#Displaying output
anova_type3 %>%
  as.data.frame() %>%
  rownames_to_column("Source") %>%
  mutate(across(where(is.numeric), ~ round(., 4))) %>%
  kable(
    caption = "Table 2: Type III Sums of Squares",
    col.names = c("Source", "Sum of Sq", "df", "F value", "p-value")
  ) %>%
    kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Table 2: Type III Sums of Squares
Source Sum of Sq df F value p-value
(Intercept) 13268.5959 1 245.5036 0.0000
physhlth_days 37622.7952 1 696.1198 0.0000
bmi 345.2408 1 6.3879 0.0115
sex 3723.8662 1 68.9012 0.0000
exercise 497.0434 1 9.1966 0.0024
age_group 30092.1774 12 46.3986 0.0000
income 4733.8943 6 14.5982 0.0000
education 1265.1504 4 5.8521 0.0001
smoking 5101.1076 3 31.4613 0.0000
Residuals 430750.0872 7970 NA NA

Based on the Type III sum of squares, all predictors in the model show statistically significant partial associations with mentally unhealthy days.

#
#Reduced model (no income)
reduced_mod <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + education + smoking, data = brfss_analytic)

#Compare reduced vs full
anova(brfss_mlr, reduced_mod) %>%
  as.data.frame() %>%
  rownames_to_column ("Model") %>%
  mutate(
    Model = c("Full (with income)", "Reduced (no income)"),
    across(where(is.numeric), ~round(., 4))
  ) %>%
  kable(
    caption = "Table 3: Does income add to the model?",
    col.names = c("Model", "Res.df", "RSS", "df", "Sum of Sq", "F", "p-value")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Table 3: Does income add to the model?
Model Res.df RSS df Sum of Sq F p-value
Full (with income) 7970 430750.1 NA NA NA NA
Reduced (no income) 7976 435484.0 -6 -4733.894 14.5982 0

H0: All income coefficients = 0 Ha: At least one income coefficient != 0 F-statistic = 14.5982 Degrees of freedom = 6(from income categories) 7970 (full model df) p-value = <.001

Conclusion: We reject the null hypothesis. Income improves the model after accounting for all other predictors.

#Reduced model (no education)
no_education_model <- lm(menthlth_days ~physhlth_days + bmi + sex + exercise + age_group + income + smoking, data = brfss_analytic)

#Compare no education reduced model to full model
anova(brfss_mlr, no_education_model) %>%
  as.data.frame() %>%
  rownames_to_column ("Model") %>%
  mutate(
    Model = c("Full (with education)", "Reduced (no education)"),
    across(where(is.numeric), ~round(., 4))
  ) %>%
  kable(
    caption = "Table 4: Does education add to the model?",
    col.names = c("Model", "Res.df", "RSS", "df", "Sum of Sq", "F", "p-value")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Table 4: Does education add to the model?
Model Res.df RSS df Sum of Sq F p-value
Full (with education) 7970 430750.1 NA NA NA NA
Reduced (no education) 7974 432015.2 -4 -1265.15 5.8521 1e-04

H0: All education coefficients = 0 Ha: At least one education coefficient != 0

F-statistic = 5.8521 Degrees of Freedom: 4 (from education), 7970 (full model) p-value: .0001

Conclusion: we reject the null hypothesis. Education improves the model after accounting for all other predictors.

Part 2 Step 4:

The Type III results showed that all predictors in the model have statistically significant associations with mentally unhealthy days after adjusting for the other variables. The chunk tests illustrated income and education both improve the model. Unlike individual t-test, chunk tests assess whether an entire categorical predictor adds explanatory power collectively.

#binary smoking variable
brfss_analytic <- brfss_analytic %>%
  mutate(
    smoker_current = case_when(
      smoking %in% c("Current daily smoker", "Current some-day smoker") ~ "Current smoker",
      TRUE ~ "Non-smoker"
    ),
    smoker_current = factor(smoker_current, levels = c("Non-smoker", "Current smoker"))
  )
#Model A main effects
model_A <- lm(menthlth_days ~ physhlth_days + bmi + sex + smoker_current + exercise + age_group + income + education, data = brfss_analytic)

#Model B with interaction
model_B <- lm(menthlth_days ~ physhlth_days + bmi + sex * smoker_current + exercise + age_group + income + education, data= brfss_analytic)
#comparing two models
anova(model_A, model_B) |>
  tidy() |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(caption = "Comparing the models") |>
    kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Comparing the models
term df.residual rss df sumsq statistic p.value
menthlth_days ~ physhlth_days + bmi + sex + smoker_current + exercise + age_group + income + education 7972 432508.9 NA NA NA NA
menthlth_days ~ physhlth_days + bmi + sex * smoker_current + exercise + age_group + income + education 7971 432174.9 1 333.9749 6.1598 0.0131

Hypotheses H0: The interaction between sex and current smoking = 0 Ha: the interaction != 0

Test results: F-statistic: 6.1598 Degrees of freedom: 7971 p-value: 0.0131

Conclusion: Based on the test results, we reject the null hypothesis. There is statistically significant evidence that the association between current smoking and mentally unhealthy days differs between men and women.

summary(model_B)
## 
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex * smoker_current + 
##     exercise + age_group + income + education, data = brfss_analytic)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.337  -3.837  -1.604   0.628  30.426 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                                8.71726    0.69624  12.520  < 2e-16
## physhlth_days                              0.26859    0.01007  26.679  < 2e-16
## bmi                                        0.03309    0.01323   2.502 0.012380
## sexFemale                                  1.18553    0.17752   6.678 2.58e-11
## smoker_currentCurrent smoker               1.52079    0.36539   4.162 3.19e-05
## exerciseYes                               -0.67285    0.21496  -3.130 0.001753
## age_group25-29                            -0.92018    0.51962  -1.771 0.076616
## age_group30-34                            -0.89242    0.50591  -1.764 0.077774
## age_group35-39                            -1.59287    0.48752  -3.267 0.001090
## age_group40-44                            -2.62864    0.48564  -5.413 6.39e-08
## age_group45-49                            -2.84255    0.49687  -5.721 1.10e-08
## age_group50-54                            -3.27784    0.48195  -6.801 1.11e-11
## age_group55-59                            -4.24987    0.47404  -8.965  < 2e-16
## age_group60-64                            -4.26404    0.45671  -9.336  < 2e-16
## age_group65-69                            -5.25062    0.44662 -11.756  < 2e-16
## age_group70-74                            -5.71106    0.45439 -12.569  < 2e-16
## age_group75-79                            -5.90758    0.47018 -12.565  < 2e-16
## age_group80+                              -6.49946    0.47359 -13.724  < 2e-16
## income$15,000–$24,999                     -1.63574    0.46999  -3.480 0.000503
## income$25,000–$34,999                     -2.07457    0.44862  -4.624 3.82e-06
## income$35,000–$49,999                     -2.54551    0.43915  -5.796 7.03e-09
## income$50,000–$74,999                     -3.04298    0.41157  -7.394 1.57e-13
## income$75,000–$199,999                    -3.50972    0.42999  -8.162 3.79e-16
## income$200,000 or more                    -3.84047    0.50103  -7.665 2.00e-14
## educationGraduate or professional degree  -0.14883    0.21054  -0.707 0.479654
## educationHigh school diploma or GED       -1.69232    0.50046  -3.382 0.000724
## educationLess than high school            -1.81788    0.69283  -2.624 0.008711
## educationSome college or technical school -0.69999    0.23849  -2.935 0.003343
## sexFemale:smoker_currentCurrent smoker     1.28327    0.51705   2.482 0.013089
##                                              
## (Intercept)                               ***
## physhlth_days                             ***
## bmi                                       *  
## sexFemale                                 ***
## smoker_currentCurrent smoker              ***
## exerciseYes                               ** 
## age_group25-29                            .  
## age_group30-34                            .  
## age_group35-39                            ** 
## age_group40-44                            ***
## age_group45-49                            ***
## age_group50-54                            ***
## age_group55-59                            ***
## age_group60-64                            ***
## age_group65-69                            ***
## age_group70-74                            ***
## age_group75-79                            ***
## age_group80+                              ***
## income$15,000–$24,999                     ***
## income$25,000–$34,999                     ***
## income$35,000–$49,999                     ***
## income$50,000–$74,999                     ***
## income$75,000–$199,999                    ***
## income$200,000 or more                    ***
## educationGraduate or professional degree     
## educationHigh school diploma or GED       ***
## educationLess than high school            ** 
## educationSome college or technical school ** 
## sexFemale:smoker_currentCurrent smoker    *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.363 on 7971 degrees of freedom
## Multiple R-squared:  0.1826, Adjusted R-squared:  0.1798 
## F-statistic: 63.61 on 28 and 7971 DF,  p-value: < 2.2e-16

The coefficient for smoker_current is 1.52079. Among men, current smokers report 1.52 more mentally unhealthy days per month than non-smokers, adjusting for all other predictors.

The coefficient of the interaction term is 1.28327. Adding this coefficient to the main smoking effect gives an estimated effect of 2.80406 for women. This indicates that among women, current smokers report about 2.80 more mentally unhealthy days per month than non-smokers, after adjusting for other predictors.

Smoking is more strongly associated with poor mental health among women than among men. Current smoking is linked to 1.52 additional mentally unhealthy days per month for men, but 2.80 additional days for women. This indicates the association is about 1.28 days stronger in women.

#predicted values
pred <- ggpredict(model_B, terms = c("smoker_current", "sex"))

#
ggplot(pred, aes(x = x, y = predicted, color = group, fill = group)) +
  geom_line(aes(group = group),
            linewidth = 1.1,
            position = position_dodge(width = 0.2)) +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high, group = group),
              alpha = 0.1,
              color = NA,
              position = position_dodge(width = 0.2)) +
  labs(
    title = "Predicted Mentally Unhealthy Days by Smoking Status and Sex",
    x = "Smoking Status",
    y = "Predicted Mentally Unhealthy Days (past 30 days)",
    color = "Sex",
    fill = "Sex"
  ) +
  theme_minimal(base_size = 13)

Smoking is linked to more days of poor mental health for both men and women, but the increase is noticeably larger for women. Women who smoke experience a sharper rise in mentally unhealthy days compared to women who do not smoke. This pattern suggests that smoking may place a disproportionate mental health burden on women. This findings highlight the importance of tailoring smoking-related mental health interventions to address the specific needs and experiences of women.

Part 4: Reflection

The results suggest that multiple social, behavioral, and health-related factors shape the mental health of US adults. Physical health has one of the strongest associations, indicating people experiencing more days of poor physical health also tend to report more mentally unhealthy days. Smoking and sex were also meaningfully related to mental health where current smokers reported worse outcomes. A limitation of this analysis is the use of cross-sectional survey data, which prevents us from determining whether these predictors cause changes in mental health or simply occur alongside them.

The distinction between R-squared and adjusted R-squared is especially important in multiple regression. While R-squared always increases as more predictors are added, adjusted R-squared penalized the model for including variables that do not meaningfully improve the prediction. This is relevant when adding categorical predictors with many levels, because each level introduces additional parameters that can inflate R-squared without improving the fit of the model. Using chunk tests to test groups of predictors collectively provides a more rigorous assessment of whether an entire predictor like income or education adds explanatory value beyond the other variables in the model. Relying only on individual t-tests for each category can be misleading, because even if no single level is statistically significant on its own, the group of predictors as a whole may still contribute important variation.

I used AI assistance to explain error messages when creating visualizations. I received a few errors earlier in the assignment, where I ask AI to explain the error and potential ways to fix them.