knitr::opts_chunk$set(echo = TRUE)

Part 0: Data Acquisition and Preparation

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tibble' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'dplyr' was built under R version 4.5.2
## Warning: package 'stringr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(haven)
## Warning: package 'haven' was built under R version 4.5.2
library(janitor)
## Warning: package 'janitor' was built under R version 4.5.2
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(knitr)
## Warning: package 'knitr' was built under R version 4.5.2
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.5.2
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(plotly)
## Warning: package 'plotly' was built under R version 4.5.2
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(broom)
## Warning: package 'broom' was built under R version 4.5.2
library(ggeffects)
## Warning: package 'ggeffects' was built under R version 4.5.2
library(ggstats)
## Warning: package 'ggstats' was built under R version 4.5.2
library(gtsummary)
## Warning: package 'gtsummary' was built under R version 4.5.2
library(GGally)
## Warning: package 'GGally' was built under R version 4.5.2
library(car)
## Warning: package 'car' was built under R version 4.5.2
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.5.2
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.5.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.5.2
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.2
## corrplot 0.95 loaded
library(dplyr)
brfss_full <- read_xpt(
  "C:/Users/abbym/OneDrive/Desktop/STATS553/Assignments/Homework #3/LLCP2023XPT/LLCP2023.XPT"
) |>
  clean_names()
brfss_clean <- brfss_full |>

# Recode variables
  mutate(
    # Outcome: mentally unhealthy days in past 30 (88 = none = 0; 77/99 = DK/refused = NA)
    menthlth_days = case_when(
      menthlth == 88              ~ 0,
      menthlth >= 1 & menthlth <= 30 ~ as.numeric(menthlth),
      TRUE                        ~ NA_real_
    ),
    # Physical health days (key predictor)
    physhlth_days = case_when(
      physhlth == 88              ~ 0,
      physhlth >= 1 & physhlth <= 30 ~ as.numeric(physhlth),
      TRUE                        ~ NA_real_
    ),
    # Age category
    age_group = factor(case_when(
      ageg5yr == 1 ~ "18-24",
      ageg5yr == 2 ~ "25-29",
      ageg5yr == 3 ~ "30-34",
      ageg5yr == 4 ~ "35-39",
      ageg5yr == 5 ~ "40-44",
      ageg5yr == 6 ~ "45-49",
      ageg5yr == 7 ~ "50-54",
      ageg5yr == 8 ~ "55-59",
      ageg5yr == 9 ~ "60-64",
      ageg5yr == 10 ~ "65-69",
      ageg5yr == 11 ~ "70-74",
      ageg5yr == 12 ~ "75-79",
      ageg5yr == 13 ~ "80+",
      ageg5yr == 14 ~ NA
    )), 
    # Income
    income = factor(case_when(
      incomg1 == 1 ~ "Less than $15,000",
      incomg1 == 2 ~ "$15,000 to <$25,000",
      incomg1 == 3 ~ "$25,000 to <$35,000",
      incomg1 == 4 ~ "$35,000 to <$50,000",
      incomg1 == 5 ~ "$50,000 to <$100,000",
      incomg1 == 6 ~ "$100,000 to <$200,000",
      incomg1 == 7 ~ "$200,000+",
      incomg1 == 9 ~ NA
    )), 
    # Education
    education = factor(case_when(
      educa == 1 ~ "Less than high school", 
      educa == 2 ~ "Less than high school",
      educa == 3 ~ "High school diploma or GED",
      educa == 4 ~ "Some college or technical school",
      educa == 5 ~ "College graduate",
      educa == 6 ~ "Graduate or professional degree",
      educa == 9 ~ NA
    )), 
    # Smoker
    smoking = factor(case_when(
      smoker3 == 1 ~ "Current daily smoker",
      smoker3 == 2 ~ "Current some-day smoker",
      smoker3 == 3 ~ "Former smoker",
      smoker3 == 4 ~ "Never smoker",
      smoker3 == 9 ~ NA
    )),
    # Sex
    sex = factor(sexvar, levels = c(1, 2), labels = c("Male", "Female")),
    # Exercise in past 30 days (any physical activity)
    exercise = factor(case_when(
      exerany2 == 1 ~ "Yes",
      exerany2 == 2 ~ "No",
      TRUE          ~ NA_character_
    ), levels = c("No", "Yes")),
    # BMI (stored as integer × 100 in BRFSS)
    bmi = ifelse(bmi5 > 0, bmi5 / 100, NA_real_)
  ) |>
  
  # Select Variables
 dplyr::select(physhlth_days, menthlth_days, bmi, sex, exercise, age_group, income, education, smoking)
# ── Analytic sample (reproducible random sample) ────────────────────────────
set.seed(1220)
brfss_analytic <- brfss_clean |>
 drop_na(menthlth_days, physhlth_days, bmi, sex, exercise,
 age_group, income, education, smoking) |>
 slice_sample(n = 8000)

# Save for lab activity
saveRDS(brfss_analytic,
 "C:/Users/abbym/OneDrive/Desktop/STATS553/Assignments/Homework #3/brfss_analytic.rds")

tibble(Metric = c("Observations", "Variables"),
       Value  = c(nrow(brfss_analytic), ncol(brfss_analytic))) |>
  kable(caption = "Analytic Dataset Dimensions") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Analytic Dataset Dimensions
Metric Value
Observations 8000
Variables 9
brfss_analytic |>
  tbl_summary(
    by = sex,
    label = list(
      menthlth_days ~ "Mentally unhealthy days (past 30)",
      physhlth_days ~ "Physically unhealthy days (past 30)",
      bmi     ~ "BMI (kg/m^2)",
      age_group           ~ "Age Group (years)",
      income      ~ "Household income",
      exercise      ~ "Any physical activity (past 30 days)",
      education ~ "Education Level",
      smoking ~ "Smoking status"
    ),
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = all_continuous() ~ 1,
    missing = "no"
  ) |>
  add_n() |>
  # add_overall() |>
  bold_labels() |>
  #italicize_levels() |>
  modify_caption("**Table 1. Descriptive Statistics — BRFSS 2023 Analytic Sample (n = 8,000)**")
Table 1. Descriptive Statistics — BRFSS 2023 Analytic Sample (n = 8,000)
Characteristic N Male
N = 3,936
1
Female
N = 4,064
1
Physically unhealthy days (past 30) 8,000 4.0 (8.4) 4.9 (8.9)
Mentally unhealthy days (past 30) 8,000 3.5 (7.5) 5.1 (8.6)
BMI (kg/m^2) 8,000 28.7 (6.0) 28.7 (7.0)
Any physical activity (past 30 days) 8,000 3,146 (80%) 3,094 (76%)
Age Group (years) 8,000

    18-24
235 (6.0%) 171 (4.2%)
    25-29
219 (5.6%) 189 (4.7%)
    30-34
253 (6.4%) 210 (5.2%)
    35-39
263 (6.7%) 302 (7.4%)
    40-44
290 (7.4%) 292 (7.2%)
    45-49
266 (6.8%) 252 (6.2%)
    50-54
305 (7.7%) 303 (7.5%)
    55-59
308 (7.8%) 352 (8.7%)
    60-64
408 (10%) 379 (9.3%)
    65-69
418 (11%) 483 (12%)
    70-74
382 (9.7%) 426 (10%)
    75-79
325 (8.3%) 338 (8.3%)
    80+
264 (6.7%) 367 (9.0%)
Household income 8,000

    $100,000 to <$200,000
996 (25%) 869 (21%)
    $15,000 to <$25,000
271 (6.9%) 370 (9.1%)
    $200,000+
400 (10%) 238 (5.9%)
    $25,000 to <$35,000
376 (9.6%) 495 (12%)
    $35,000 to <$50,000
482 (12%) 585 (14%)
    $50,000 to <$100,000
1,251 (32%) 1,260 (31%)
    Less than $15,000
160 (4.1%) 247 (6.1%)
Education Level 8,000

    College graduate
1,018 (26%) 1,120 (28%)
    Graduate or professional degree
1,763 (45%) 1,896 (47%)
    High school diploma or GED
130 (3.3%) 122 (3.0%)
    Less than high school
75 (1.9%) 49 (1.2%)
    Some college or technical school
950 (24%) 877 (22%)
Smoking status 8,000

    Current daily smoker
339 (8.6%) 319 (7.8%)
    Current some-day smoker
151 (3.8%) 117 (2.9%)
    Former smoker
1,207 (31%) 1,055 (26%)
    Never smoker
2,239 (57%) 2,573 (63%)
1 Mean (SD); n (%)

Part 1: Multiple Linear Regression

 # Fitting the model
m1 <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + income + education + smoking, data = brfss_analytic)

summary(m1)
## 
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + exercise + 
##     age_group + income + education + smoking, data = brfss_analytic)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.080  -3.865  -1.597   0.712  30.471 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                                7.94160    0.67128  11.830  < 2e-16
## physhlth_days                              0.26558    0.01007  26.384  < 2e-16
## bmi                                        0.03338    0.01321   2.527 0.011510
## sexFemale                                  1.39038    0.16750   8.301  < 2e-16
## exerciseYes                               -0.65116    0.21472  -3.033 0.002432
## age_group25-29                            -1.05660    0.51938  -2.034 0.041950
## age_group30-34                            -1.09395    0.50646  -2.160 0.030803
## age_group35-39                            -1.81103    0.48851  -3.707 0.000211
## age_group40-44                            -2.89307    0.48749  -5.935 3.07e-09
## age_group45-49                            -3.05618    0.49769  -6.141 8.61e-10
## age_group50-54                            -3.51674    0.48323  -7.277 3.72e-13
## age_group55-59                            -4.49597    0.47555  -9.454  < 2e-16
## age_group60-64                            -4.52215    0.45848  -9.863  < 2e-16
## age_group65-69                            -5.57795    0.45019 -12.390  < 2e-16
## age_group70-74                            -6.02536    0.45741 -13.173  < 2e-16
## age_group75-79                            -6.28656    0.47484 -13.239  < 2e-16
## age_group80+                              -6.81968    0.47684 -14.302  < 2e-16
## income$15,000 to <$25,000                  1.86281    0.36432   5.113 3.24e-07
## income$200,000+                           -0.28425    0.34023  -0.835 0.403477
## income$25,000 to <$35,000                  1.42347    0.32097   4.435 9.33e-06
## income$35,000 to <$50,000                  0.95299    0.29630   3.216 0.001304
## income$50,000 to <$100,000                 0.44936    0.23068   1.948 0.051456
## incomeLess than $15,000                    3.49984    0.42923   8.154 4.07e-16
## educationGraduate or professional degree  -0.05310    0.21116  -0.251 0.801448
## educationHigh school diploma or GED       -1.71101    0.49969  -3.424 0.000620
## educationLess than high school            -1.79091    0.69119  -2.591 0.009585
## educationSome college or technical school -0.71412    0.23816  -2.998 0.002722
## smokingCurrent some-day smoker            -1.58670    0.53523  -2.965 0.003041
## smokingFormer smoker                      -1.98971    0.33713  -5.902 3.74e-09
## smokingNever smoker                       -2.93681    0.32162  -9.131  < 2e-16
##                                              
## (Intercept)                               ***
## physhlth_days                             ***
## bmi                                       *  
## sexFemale                                 ***
## exerciseYes                               ** 
## age_group25-29                            *  
## age_group30-34                            *  
## age_group35-39                            ***
## age_group40-44                            ***
## age_group45-49                            ***
## age_group50-54                            ***
## age_group55-59                            ***
## age_group60-64                            ***
## age_group65-69                            ***
## age_group70-74                            ***
## age_group75-79                            ***
## age_group80+                              ***
## income$15,000 to <$25,000                 ***
## income$200,000+                              
## income$25,000 to <$35,000                 ***
## income$35,000 to <$50,000                 ** 
## income$50,000 to <$100,000                .  
## incomeLess than $15,000                   ***
## educationGraduate or professional degree     
## educationHigh school diploma or GED       ***
## educationLess than high school            ** 
## educationSome college or technical school ** 
## smokingCurrent some-day smoker            ** 
## smokingFormer smoker                      ***
## smokingNever smoker                       ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.352 on 7970 degrees of freedom
## Multiple R-squared:  0.1853, Adjusted R-squared:  0.1824 
## F-statistic: 62.52 on 29 and 7970 DF,  p-value: < 2.2e-16

Fitted Equation: menthlth_days = 9.804 + 0.266(physhlth_days) + 0.033(bmi) + 1.390(sexFemale) - 0.651(exerciseYes) - 1.057(age_group25-29) - 1.094(age_group30-34) - 1.811(age_group35-39) - 2.893(age_group40-44) - 3.056(age_group45-49) - 3.517(age_group50-54) - 4.496(age_group55-59) - 4.522(age_group60-64) - 5.578(age_group65-69) - 6.025(age_group70-74) - 6.287(age_group75-79) - 6.820(age_group80+) - 2.147(income$200,000+) - 0.439(income$25,000 to <$35,000) - 0.910(income$35,000 to <$50,000) - 1.413(income$50,000 to <$100,000) - 1.863(income100,000 to <$200,000) - 1.637(incomeLess than $15,000) - 0.053(educationGraduate or professional degree) - 1.711(educationHigh school diploma or GED) - 1.791(educationLess than high school) - 0.714(educationSome college or technical school) - 1.587(smokingCurrent some-day smoker) - 1.990(smokingFormer smoker) - 2.937(smokingNever smoker)

Interpretation: For everyone one unit increase in the number of physically unhealthy days, there is an increase of 0.266 mentally unhealthy days, holding all other variables constant. For every one unit increase in BMI, there is an increase of 0.033 mentally unhealthy days, holding all other variables constant. Compared to males, females are expected to have 1.390 more mentally unhealthy days, holding all other variables constant. Compared to those who do not exercise, those who do exercise experience 0.651 less mentally unhealthy days, holding all other variables constant. Compared to 18-24 year olds, 30-34 year olds experience 1.094 fewer mentally unhealthy days, holding all other variables constant. Compared to those who make less than $15,000, those who make $200,000 or more have 2.147 fewer mentally unhealthy days, and those who make $25,000-<$35,000 have 0.439 fewer mentally unhealthy days, holding all other variables constant.

glance(m1) |>
  dplyr::select(r.squared, adj.r.squared, sigma, statistic, p.value, df, df.residual, nobs) |>
  mutate(across(where(is.numeric), ~ round(., 4))) |>
  pivot_longer(everything(), names_to = "Statistic", values_to = "Value") |>
  mutate(Statistic = dplyr::recode(Statistic,
    "r.squared"     = "R²",
    "adj.r.squared" = "Adjusted R²",
    "sigma"         = "Residual Std. Error (Root MSE)",
    "statistic"     = "F-statistic",
    "p.value"       = "p-value (overall F-test)",
    "df"            = "Model df (p)",
    "df.residual"   = "Residual df (n − p − 1)",
    "nobs"          = "n (observations)"
  )) %>%
  kable(caption = "Table 2. Overall Model Summary — Model 1") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Table 2. Overall Model Summary — Model 1
Statistic Value
0.1853
Adjusted R² 0.1824
Residual Std. Error (Root MSE) 7.3516
F-statistic 62.5234
p-value (overall F-test) 0.0000
Model df (p) 29.0000
Residual df (n − p − 1) 7970.0000
n (observations) 8000.0000
# MSE
rmse_m1 <- round(summary(m1)$sigma, 2)
cat("Root MSE (Model 1):", rmse_m1, "mentally unhealthy days\n")
## Root MSE (Model 1): 7.35 mentally unhealthy days

Interpretation: 18.53% of all of the variance in mentally unhealthy days can be explained by all of the predictors in the model. The adjusted r-squared states that 18.24% of all the variance in the mentally unhealthy days can be explained by all of the predictors in the model, but adjusted r-squared takes into consideration the number of predictors, and can decrease if a useless predictor is included. The MSE is 7.35 mentally unhealthy days, meaning the model prediction is typically off by 7.35 mentally unhealthy days, on average. The null hypothesis for the overall F-test is that none of the predictors matter or add to the model. The F-statistic is 62.5234. The numerator degrees of freedom are 29 and the denominator degrees of freedom are 7970. The p-value is 0, meaning the predictors do matter in the model and help to predict the number of mentally unhealthy days.

Part 2: Tests of Hypotheses

# Type III Test
anova_type3 <- Anova(m1, type = "III")

# Display Results
anova_type3 %>%
  as.data.frame() %>%
  rownames_to_column("Source") %>%
  mutate(across(where(is.numeric), ~ round(., 4))) %>%
  kable(
    caption = "Table 3. Type III (Partial) Sums of Squares — car::Anova()",
    col.names = c("Source", "Sum of Sq", "df", "F value", "p-value")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  footnote(general = "Each F-test uses the MSE from the full model. Each variable is tested given ALL other variables.")
Table 3. Type III (Partial) Sums of Squares — car::Anova()
Source Sum of Sq df F value p-value
(Intercept) 7564.3734 1 139.9606 0.0000
physhlth_days 37622.7952 1 696.1198 0.0000
bmi 345.2408 1 6.3879 0.0115
sex 3723.8662 1 68.9012 0.0000
exercise 497.0434 1 9.1966 0.0024
age_group 30092.1774 12 46.3986 0.0000
income 4733.8943 6 14.5982 0.0000
education 1265.1504 4 5.8521 0.0001
smoking 5101.1076 3 31.4613 0.0000
Residuals 430750.0872 7970 NA NA
Note:
Each F-test uses the MSE from the full model. Each variable is tested given ALL other variables.

Interpretation: Physically unhealthy days, BMI, sex, exercise, age group, income, education, and smoking all had statistically significant partial associations.

m2 <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + education + smoking, data = brfss_analytic)

# Compare using anova() — this performs the partial F-test
anova(m2, m1) %>%
  as.data.frame() %>%
  rownames_to_column("Model") %>%
  mutate(
    Model = c("Reduced (no income)", "Full (with income)"),
    across(where(is.numeric), ~ round(., 4))
  ) %>%
  kable(
    caption = "Table 4. Partial F-Test: Does income add to the model?",
    col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Table 4. Partial F-Test: Does income add to the model?
Model Res. df RSS df Sum of Sq F p-value
Reduced (no income) 7976 435484.0 NA NA NA NA
Full (with income) 7970 430750.1 6 4733.894 14.5982 0

Interpretation: The null hypothesis for the partial F-Test is that income does not add to the model. The alternative hypothesis is that income does add to the model. The F-statistic is 14.5982, the degrees of freedom are 6, and the p-value is 0. Since the p-value is less than 0.05, you reject the null hypothesis that income does not add to the model. This means that income, as a predictor, adds valuable information to the model to predict mentally unhealthy days, given that all the other predictors are already in the model..

m3 <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + income + smoking, data = brfss_analytic)

# Compare using anova() — this performs the partial F-test
anova(m3, m1) %>%
  as.data.frame() %>%
  rownames_to_column("Model") %>%
  mutate(
    Model = c("Reduced (no education)", "Full (with education)"),
    across(where(is.numeric), ~ round(., 4))
  ) %>%
  kable(
    caption = "Table 5. Partial F-Test: Does education add to the model?",
    col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Table 5. Partial F-Test: Does education add to the model?
Model Res. df RSS df Sum of Sq F p-value
Reduced (no education) 7974 432015.2 NA NA NA NA
Full (with education) 7970 430750.1 4 1265.15 5.8521 1e-04

Interpretation: The null hypothesis for the partial F-Test is that education does not add to the model. The alternative hypothesis is that education does add to the model. The F-statistic is 5.8521, the degrees of freedom are 4, and the p-value is 1e-04. Since the p-value is less than 0.05, you reject the null hypothesis that education does not add to the model. This means that education, as a predictor, adds valuable information to the model to predict mentally unhealthy days, given that all the other predictors are already in the model..

Final Interpretation: The type III test helps to determine which predictors have the strongest contributions to the model. Physically unhealthy days and age group contribute the most to the model, independently. The chunk tests helps to give magnitude to how much the predictor adds to the model. In addition, it helps you understand if it adds to the model, given that other variables are already present, rather than a regular t-test, which just tells you if the predictor has an association with the outcome.

Part 3: Interaction Analysis

brfss_analytic <- brfss_analytic |>
  mutate( 
    smoker_current = factor(case_when(
      smoking == "Current daily smoker" ~ "Current smoker",
      smoking == "Current some-day smoker" ~ "Current smoker",
      smoking == "Former smoker" ~ "Non-smoker",
      smoking == "Never smoker" ~ "Non-smoker") 
    ))

# Change reference group to "Non-smoker"
brfss_analytic$smoker_current_reref <- relevel(brfss_analytic$smoker_current, ref = "Non-smoker")
# Fit model A

model_a <- lm(menthlth_days ~ physhlth_days + bmi + sex + smoker_current_reref + exercise + age_group + income+ education, data = brfss_analytic)

tidy(model_a, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Model A: No Interaction",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Model A: No Interaction
Term Estimate SE t p-value CI Lower CI Upper
(Intercept) 5.1228 0.6105 8.3907 0.0000 3.9260 6.3197
physhlth_days 0.2686 0.0101 26.6731 0.0000 0.2489 0.2884
bmi 0.0334 0.0132 2.5250 0.0116 0.0075 0.0593
sexFemale 1.3331 0.1673 7.9675 0.0000 1.0051 1.6611
smoker_current_rerefCurrent smoker 2.1287 0.2712 7.8489 0.0000 1.5971 2.6604
exerciseYes -0.6725 0.2150 -3.1274 0.0018 -1.0940 -0.2510
age_group25-29 -0.9149 0.5198 -1.7602 0.0784 -1.9338 0.1040
age_group30-34 -0.8823 0.5061 -1.7434 0.0813 -1.8743 0.1097
age_group35-39 -1.5810 0.4877 -3.2421 0.0012 -2.5369 -0.6251
age_group40-44 -2.6157 0.4858 -5.3847 0.0000 -3.5680 -1.6635
age_group45-49 -2.8246 0.4970 -5.6836 0.0000 -3.7988 -1.8504
age_group50-54 -3.2600 0.4821 -6.7628 0.0000 -4.2050 -2.3151
age_group55-59 -4.2301 0.4741 -8.9219 0.0000 -5.1595 -3.3007
age_group60-64 -4.2484 0.4568 -9.3000 0.0000 -5.1439 -3.3529
age_group65-69 -5.2338 0.4467 -11.7163 0.0000 -6.1095 -4.3582
age_group70-74 -5.7023 0.4545 -12.5457 0.0000 -6.5933 -4.8113
age_group75-79 -5.8977 0.4703 -12.5401 0.0000 -6.8197 -4.9758
age_group80+ -6.4888 0.4737 -13.6975 0.0000 -7.4174 -5.5602
income$15,000 to <$25,000 1.8562 0.3650 5.0855 0.0000 1.1407 2.5717
income$200,000+ -0.3265 0.3408 -0.9581 0.3380 -0.9946 0.3415
income$25,000 to <$35,000 1.4337 0.3214 4.4605 0.0000 0.8036 2.0637
income$35,000 to <$50,000 0.9491 0.2969 3.1971 0.0014 0.3672 1.5310
income$50,000 to <$100,000 0.4537 0.2311 1.9632 0.0497 0.0007 0.9067
incomeLess than $15,000 3.5360 0.4300 8.2232 0.0000 2.6931 4.3789
educationGraduate or professional degree -0.1579 0.2106 -0.7496 0.4535 -0.5706 0.2549
educationHigh school diploma or GED -1.6896 0.5006 -3.3750 0.0007 -2.6710 -0.7083
educationLess than high school -1.9035 0.6922 -2.7499 0.0060 -3.2604 -0.5466
educationSome college or technical school -0.7070 0.2385 -2.9636 0.0030 -1.1746 -0.2393
# Fit model B

model_b <- lm(menthlth_days ~ physhlth_days + bmi + sex * smoker_current_reref + exercise + age_group + income+ education, data = brfss_analytic)

tidy(model_b, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Model B: Interaction between Sex and Smoking Status",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Model B: Interaction between Sex and Smoking Status
Term Estimate SE t p-value CI Lower CI Upper
(Intercept) 5.2075 0.6113 8.5189 0.0000 4.0092 6.4058
physhlth_days 0.2686 0.0101 26.6788 0.0000 0.2489 0.2883
bmi 0.0331 0.0132 2.5017 0.0124 0.0072 0.0590
sexFemale 1.1855 0.1775 6.6784 0.0000 0.8376 1.5335
smoker_current_rerefCurrent smoker 1.5208 0.3654 4.1621 0.0000 0.8045 2.2371
exerciseYes -0.6728 0.2150 -3.1301 0.0018 -1.0942 -0.2515
age_group25-29 -0.9202 0.5196 -1.7709 0.0766 -1.9388 0.0984
age_group30-34 -0.8924 0.5059 -1.7640 0.0778 -1.8841 0.0993
age_group35-39 -1.5929 0.4875 -3.2673 0.0011 -2.5485 -0.6372
age_group40-44 -2.6286 0.4856 -5.4127 0.0000 -3.5806 -1.6766
age_group45-49 -2.8425 0.4969 -5.7209 0.0000 -3.8165 -1.8686
age_group50-54 -3.2778 0.4820 -6.8012 0.0000 -4.2226 -2.3331
age_group55-59 -4.2499 0.4740 -8.9652 0.0000 -5.1791 -3.3206
age_group60-64 -4.2640 0.4567 -9.3364 0.0000 -5.1593 -3.3688
age_group65-69 -5.2506 0.4466 -11.7563 0.0000 -6.1261 -4.3751
age_group70-74 -5.7111 0.4544 -12.5686 0.0000 -6.6018 -4.8203
age_group75-79 -5.9076 0.4702 -12.5646 0.0000 -6.8292 -4.9859
age_group80+ -6.4995 0.4736 -13.7239 0.0000 -7.4278 -5.5711
income$15,000 to <$25,000 1.8740 0.3650 5.1348 0.0000 1.1586 2.5894
income$200,000+ -0.3307 0.3407 -0.9708 0.3317 -0.9986 0.3371
income$25,000 to <$35,000 1.4352 0.3213 4.4666 0.0000 0.8053 2.0650
income$35,000 to <$50,000 0.9642 0.2968 3.2485 0.0012 0.3824 1.5461
income$50,000 to <$100,000 0.4667 0.2311 2.0198 0.0434 0.0138 0.9197
incomeLess than $15,000 3.5097 0.4300 8.1623 0.0000 2.6668 4.3526
educationGraduate or professional degree -0.1488 0.2105 -0.7069 0.4797 -0.5615 0.2639
educationHigh school diploma or GED -1.6923 0.5005 -3.3815 0.0007 -2.6734 -0.7113
educationLess than high school -1.8179 0.6928 -2.6239 0.0087 -3.1760 -0.4598
educationSome college or technical school -0.7000 0.2385 -2.9351 0.0033 -1.1675 -0.2325
sexFemale:smoker_current_rerefCurrent smoker 1.2833 0.5171 2.4819 0.0131 0.2697 2.2968
# Compare using anova() — this performs the partial F-test
anova(model_a, model_b) %>%
  as.data.frame() %>%
  rownames_to_column("Model") %>%
  mutate(
    Model = c("Model A (no interaction)", "Model B (with interaction)"),
    across(where(is.numeric), ~ round(., 4))
  ) %>%
  kable(
    caption = "Table 6. Partial F-Test: Does education add to the model?",
    col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Table 6. Partial F-Test: Does education add to the model?
Model Res. df RSS df Sum of Sq F p-value
Model A (no interaction) 7972 432508.9 NA NA NA NA
Model B (with interaction) 7971 432174.9 1 333.9749 6.1598 0.0131

Interpretation: The null hypothesis is that the interaction term does not add anything to the model, and the alternative hypothesis is that the interaction term between sex and current smoking does add to the model. The F-statistic is 6.1598, the degrees of freedom is 1, and the p-value is 0.0131. Since the p-value is less than 0.05, you reject the null hypothesis that adding the interaction term between sex and current smoking doesn’t add anything to the model. Therefore, adding the interaction between sex and current smoking does add to the model, given that all the other predictors are already in the model. Compared to those who don’t smoke, male current smokers tend to have 1.5208 more mentally unhealthy days. Compared to those who don’t smoke, females that do smoke tend to have 1.2833 more mentally unhealthy days. Based on the model, male smokers tend to have a stronger association with more mentally unhealthy days than women who smoke. This difference, however, is only slightly different; about 0.24 days worth.

pred_int <- ggpredict(model_b, terms = c("smoker_current_reref", "sex"))

plot(pred_int) + labs(
  x= "Smoking Status",
  y= "Mentally Unhealthy Days",
  title= "Predicted Mentally Health Days by Sex and Smoking Status"
)
## Ignoring unknown labels:
## • linetype : "sex"
## • shape : "sex"

Interpretation: The effect of smoking status on the number of mentally unhealthy days varies by sex. Men who are current smokers are more affected by smoking on their mentally unhealthy days than women are. The interaction between sex and smoking status contributes important information when predicting the number of mentally unhealthy days a person faces.

Part 4: Reflection The results of this exploration of different models predicting the number of mentally unhealthy days a person may face in a month shows that this is a complex idea to predict and that there are many factors that play into this prediction. Education, age, exercise and smoking are most strongly associated with mentally unhealthy days. None of the predictors were not statistically significant. The key limitations when it comes to using cross-sectional data is the idea that it is hard to establish causality, as it is hard to establish temporality. Because all of the data was collected at the same time, it is hard to determine which data collected preceded the other data. Two specific confounders that could bias the associations estimated could include the presence of a mental illness in a person, such as depression or anxiety, as well as general health of a person. The adjusted r-squared helps to distinguish which models are the best for predicting an outcome. It is better than r-squared, because it takes into account the number of predictors, and actually penalizes for increasing the number of predictors. This means that it helps to determine which predictors are actually useful for determining an outcome, rather than just adding a bunch of predictors that may or may not be useful. This is especially useful when adding multiple categorical predictors, because each level of a categorical predictor is included as a new predictor in a regression model. It is useful to test groups of predictors collectively with chunk tests rather than just using t-tests, because chunk tests allows you to see the effect of the predictor given all of the variables already in the model, whereas the t-test only tells you whether or not the predictor is associated with the outcome, without considering the other predictors in the model. I did not use any AI for this assignment.