knitr::opts_chunk$set(
  echo    = TRUE,
  warning = FALSE,
  message = FALSE,
  fig.width  = 10,
  fig.height = 6
)

Setup and Data

library(tidyverse)
library(knitr)
library(haven)
library(janitor)
library(kableExtra)
library(broom)
library(gtsummary)
library(car)
library(ggeffects)
library(ResourceSelection)  # for Hosmer-Lemeshow
library(pROC)               # for ROC/AUC
library(performance)        # for model performance metrics
library(sjPlot)
library(modelsummary)
library(leaps)

options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))

brfss_full23 <- read_xpt(
  "C:/Users/graci/OneDrive/Documents/UA GRAD SCHOOL/2nd Semester/EPI553/BRFSS_2023.XPT"
) |>
  clean_names()

dim(brfss_full23)

## [1] 433323    350

Part 0: Data Acquisition and Preparation

These variables were selected based on representation of behavioral, biological, and mental health factors, along with sociodemographic characteristics. These variables are well established determines of physicals health and together capture multiple dimensions that may influence physically unhealthy days.

brfss_var <- brfss_full23 |>
  mutate(
    # Outcome
    physhlth_days = case_when(
      physhlth == 88 ~ 0,
      physhlth %in% c(77, 99) ~ NA_real_,
      physhlth >= 1 & physhlth <= 30 ~ as.numeric(physhlth),
      TRUE ~ NA_real_
    ),

    # Continuous
    menthlth_days = case_when(
      menthlth == 88 ~ 0,
      menthlth %in% c(77, 99) ~ NA_real_,
      menthlth >= 1 & menthlth <= 30 ~ as.numeric(menthlth),
      TRUE ~ NA_real_
    ),

    bmi = case_when(
      `bmi5` == 9999 ~ NA_real_,
      TRUE ~ `bmi5` / 100
    ),

    # Binary
    sex = factor(sexvar, levels = c(1, 2), labels = c("Male", "Female")),

    exercise = case_when(
      exerany2 == 1 ~ "Yes",
      exerany2 == 2 ~ "No",
      exerany2 %in% c(7, 9) ~ NA_character_
    ) |> factor(levels = c("No", "Yes")),

    # Categorical 
    age_group = case_when(
      `ageg5yr` %in% 1:3 ~ "18-34",
      `ageg5yr` %in% 4:6 ~ "35-49",
      `ageg5yr` %in% 7:9 ~ "50-64",
      `ageg5yr` %in% 10:13 ~ "65+",
      TRUE ~ NA_character_
    ) |> factor(levels = c("18-34", "35-49", "50-64", "65+")),

    income = case_when(
      `incomg1` %in% 1:2 ~ "Less than $25K",
      `incomg1` %in% 3:4 ~ "$25K-$49K",
      `incomg1` %in% 5:6 ~ "$50K-$99K",
      `incomg1` == 7 ~ "$100K+",
      TRUE ~ NA_character_
    ) |> factor(levels = c("Less than $25K", "$25K-$49K", "$50K-$99K", "$100K+")),

    education = case_when(
      educa %in% 1:3 ~ "Less than HS",
      educa == 4 ~ "HS/GED",
      educa == 5 ~ "Some college",
      educa == 6 ~ "College graduate",
      TRUE ~ NA_character_
    ) |> factor(levels = c("Less than HS", "HS/GED", "Some college", "College graduate")),

    smoking = case_when(
      `smoker3` %in% 1:2 ~ "Current",
      `smoker3` == 3 ~ "Former",
      `smoker3` == 4 ~ "Never",
      TRUE ~ NA_character_
    ) |> factor(levels = c("Never", "Former", "Current"))
  )

brfss_var |>
  select(physhlth_days,menthlth_days,bmi,age_group,income,sex,
education,smoking,exercise
  )  |>
  tbl_summary(
    label = list(
      menthlth_days ~ "Mentally unhealthy days (past 30)",
      physhlth_days ~ "Physically unhealthy days (past 30)",
      bmi           ~ "Body mass index",
      age_group     ~ "Age group",
      income        ~ "Household income",
      sex           ~ "Sex",
      exercise      ~ "Any physical activity (past 30 days)", 
      education     ~ "Highest level of education completed",
      smoking       ~ "Smoking status (4 levels)"
    ),
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = all_continuous() ~ 1,
    missing = "no"
  ) |>
  add_n() |>
  bold_labels() |>
  modify_caption("**Table 1. Descriptive Statistics — BRFSS 2023**")

**Table 1. Descriptive Statistics — BRFSS 2023**
Characteristic	N	N = 433,323¹
Physically unhealthy days (past 30)	422,538	4.5 (8.8)
Mentally unhealthy days (past 30)	425,215	4.4 (8.3)
Body mass index	392,788	28.5 (6.5)
Age group	425,544
18-34		72,330 (17%)
35-49		82,686 (19%)
50-64		107,484 (25%)
65+		163,044 (38%)
Household income	346,700
Less than $25K		50,256 (14%)
$25K-$49K		86,010 (25%)
$50K-$99K		183,664 (53%)
$100K+		26,770 (7.7%)
Sex	433,323
Male		203,782 (47%)
Female		229,541 (53%)
Highest level of education completed	430,998
Less than HS		25,172 (5.8%)
HS/GED		106,613 (25%)
Some college		114,346 (27%)
College graduate		184,867 (43%)
Smoking status (4 levels)	410,261
Never		251,981 (61%)
Former		113,134 (28%)
Current		45,146 (11%)
Any physical activity (past 30 days)	432,072	325,227 (75%)
¹ Mean (SD); n (%)

#Report the number and percentage of missing observations

total_n <- nrow(brfss_var)

# Missing summary
missing_summary <- brfss_var |>
  summarise(
    menthlth_miss_n = sum(is.na(physhlth_days)),
    menthlth_miss_pct = mean(is.na(physhlth_days)) * 100,
    
    bmi_miss_n = sum(is.na(menthlth_days)),
    bmi_miss_pct = mean(is.na(menthlth_days)) * 100,
    
    smoking_miss_n = sum(is.na(bmi)),
    smoking_miss_pct = mean(is.na(bmi)) * 100
  )
missing_summary

## # A tibble: 1 × 6
##   menthlth_miss_n menthlth_miss_pct bmi_miss_n bmi_miss_pct smoking_miss_n
##             <int>             <dbl>      <int>        <dbl>          <int>
## 1           10785              2.49       8108         1.87          40535
## # ℹ 1 more variable: smoking_miss_pct <dbl>

Analytic Sample

set.seed(1220)

brfss_analytic <- brfss_var |>
  select(physhlth_days,menthlth_days,bmi,age_group,income,sex,
education,smoking,exercise) |>
  drop_na()  |>
  slice_sample(n = 8000)

nrow(brfss_analytic)

## [1] 8000

brfss_analytic |>
  select(physhlth_days,menthlth_days,bmi,age_group,income,sex,
education,smoking,exercise) |>
  tbl_summary(
    
    label = list(
      menthlth_days ~ "Mentally unhealthy days (past 30)",
      physhlth_days ~ "Physically unhealthy days (past 30)",
      bmi           ~ "Body mass index",
      age_group     ~ "Age group",
      sex           ~ "Sex",
      income        ~ "Household income",
      exercise      ~ "Any physical activity (past 30 days)", 
      education     ~ "Highest level of education completed",
      smoking       ~ "Smoking status (4 levels)"
    ),
    
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = all_continuous() ~ 1,
    missing = "no"
  ) |>
  add_n() |>
  # add_overall() |>
  bold_labels() |>
  #italicize_levels() |>
  modify_caption("**Table 1. Descriptive Statistics by Sex — BRFSS 2023 Analytic Sample (n = 8,000)**")

**Table 1. Descriptive Statistics by Sex — BRFSS 2023 Analytic Sample (n = 8,000)**
Characteristic	N	N = 8,000¹
Physically unhealthy days (past 30)	8,000	4.4 (8.7)
Mentally unhealthy days (past 30)	8,000	4.3 (8.1)
Body mass index	8,000	28.7 (6.5)
Age group	8,000
18-34		1,277 (16%)
35-49		1,665 (21%)
50-64		2,055 (26%)
65+		3,003 (38%)
Household income	8,000
Less than $25K		1,048 (13%)
$25K-$49K		1,938 (24%)
$50K-$99K		4,376 (55%)
$100K+		638 (8.0%)
Sex	8,000
Male		3,936 (49%)
Female		4,064 (51%)
Highest level of education completed	8,000
Less than HS		376 (4.7%)
HS/GED		1,827 (23%)
Some college		2,138 (27%)
College graduate		3,659 (46%)
Smoking status (4 levels)	8,000
Never		4,812 (60%)
Former		2,262 (28%)
Current		926 (12%)
Any physical activity (past 30 days)	8,000	6,240 (78%)
¹ Mean (SD); n (%)

#Your goal is to identify the best linear regression model for predicting physhlth_days (continuous) from your chosen set of at least 8 predictors

Part 1: Model Selection for Linear Regression

Step 1: Fit the Maximum Model

mod_max <- lm(physhlth_days ~ menthlth_days + bmi + age_group + income + sex +
    education + smoking + exercise,
              data = brfss_analytic)

tidy(mod_max, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Maximum Model: All Candidate Predictors",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Maximum Model: All Candidate Predictors
Term	Estimate	SE	t	p-value	CI Lower	CI Upper
(Intercept)	5.7849	0.6589	8.7800	0.0000	4.4934	7.0765
menthlth_days	0.3040	0.0114	26.5864	0.0000	0.2816	0.3264
bmi	0.0481	0.0140	3.4364	0.0006	0.0206	0.0755
age_group35-49	1.3446	0.2986	4.5023	0.0000	0.7592	1.9300
age_group50-64	2.6421	0.2876	9.1876	0.0000	2.0784	3.2058
age_group65+	2.8790	0.2749	10.4742	0.0000	2.3402	3.4178
income$25K-$49K	-2.6229	0.3074	-8.5332	0.0000	-3.2254	-2.0203
income$50K-$99K	-3.2345	0.2961	-10.9223	0.0000	-3.8150	-2.6540
income$100K+	-3.8191	0.4328	-8.8252	0.0000	-4.6674	-2.9708
sexFemale	0.0309	0.1794	0.1720	0.8635	-0.3209	0.3826
educationHS/GED	-1.1996	0.4510	-2.6598	0.0078	-2.0836	-0.3155
educationSome college	-0.7213	0.4530	-1.5924	0.1113	-1.6093	0.1666
educationCollege graduate	-1.2362	0.4538	-2.7240	0.0065	-2.1259	-0.3466
smokingFormer	0.6796	0.2059	3.3008	0.0010	0.2760	1.0832
smokingCurrent	1.1538	0.2971	3.8831	0.0001	0.5713	1.7363
exerciseYes	-3.4495	0.2254	-15.3021	0.0000	-3.8913	-3.0076

glance(mod_max) |>
  dplyr::select(r.squared, adj.r.squared, sigma, AIC, BIC, df.residual) |>
  mutate(across(everything(), \(x) round(x, 3))) |>
  kable(caption = "Maximum Model: Fit Statistics") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Maximum Model: Fit Statistics
r.squared	adj.r.squared	sigma	AIC	BIC	df.residual
0.184	0.183	7.859	55707.42	55826.2	7984

Report and interpret: R-squared, Adjusted R-squared, AIC (using AIC()), and BIC (using BIC() R-squared = 0.184 The model explains about 18.4% of the variation in physically unhealthy days.

Adjusted R-squared = 0.183 After adjusting for the number of predictors, the model explains about 18.3% of the variation. There is almost identical values for r-squared and adjusted r-squared showing that adding predictors did not meaningfully inflate model fit.

AIC = 55,707.42 AIC is a measure of model quality that balances goodness of fit with model complexity. This value is not meaningful on its own, but it is used to compare models—lower AIC indicates a better model when comparing alternative specifications.

BIC = 55826.2 BIC is similar to AIC but penalizes model complexity more strongly. Like AIC, it is only useful for comparison across models. A lower BIC would indicate a more a simpler model with adequate fit. In this case, BIC is greater than AIC.

#Step 2:Best Subsets Regression

best_subsets <- regsubsets(
  physhlth_days ~ menthlth_days + bmi + age_group + income + sex +
    education + smoking + exercise,
              data = brfss_analytic,
 nvmax = 17,      # maximum number of variables to consider
  method = "exhaustive"
)

best_summary <- summary(best_subsets)

subset_metrics <- tibble(
  p = 1:length(best_summary$adjr2),
  `Adj. R²` = best_summary$adjr2,
  BIC = best_summary$bic,
  Cp = best_summary$cp
)

p1 <- ggplot(subset_metrics, aes(x = p, y = `Adj. R²`)) +
  geom_line(linewidth = 1, color = "steelblue") +
  geom_point(size = 3, color = "steelblue") +
  geom_vline(xintercept = which.max(best_summary$adjr2),
             linetype = "dashed", color = "tomato") +
  labs(title = "Adjusted R² by Model Size", x = "Number of Variables", y = "Adjusted R²") +
  theme_minimal(base_size = 12)

p2 <- ggplot(subset_metrics, aes(x = p, y = BIC)) +
  geom_line(linewidth = 1, color = "steelblue") +
  geom_point(size = 3, color = "steelblue") +
  geom_vline(xintercept = which.min(best_summary$bic),
             linetype = "dashed", color = "tomato") +
  labs(title = "BIC by Model Size", x = "Number of Variables", y = "BIC") +
  theme_minimal(base_size = 12)

gridExtra::grid.arrange(p1, p2, ncol = 2)

cat("Best model by Adj. R²:", which.max(best_summary$adjr2), "variables\n")

## Best model by Adj. R²: 14 variables

cat("Best model by BIC:", which.min(best_summary$bic), "variables\n")

## Best model by BIC: 11 variables

# Show which variables are in the BIC-best model
best_bic_idx <- which.min(best_summary$bic)
best_vars <- names(which(best_summary$which[best_bic_idx, -1]))
cat("\nVariables in BIC-best model:\n")

## 
## Variables in BIC-best model:

cat(paste(" ", best_vars), sep = "\n")

##   menthlth_days
##   bmi
##   age_group35-49
##   age_group50-64
##   age_group65+
##   income$25K-$49K
##   income$50K-$99K
##   income$100K+
##   smokingFormer
##   smokingCurrent
##   exerciseYes

In 2-3 sentences, describe what you observe: Do the two criteria agree on the best model size? If not, which criterion favors a simpler model?

The two criteria do not agree on the best model size. The adjusted r-squared model has a model size of about 12, while the BIC graph has a model size of about 9. The adjusted r-squared model continues to increase slightly and then levels off with more variables, while the BIC reaches its minimum and then worsens, showing that a smaller model would have been preferred. The BIC favors a simpler model.

#Step 3: Stepwise Selection Methods

#Backward elimination

mod_max <- lm(physhlth_days ~ menthlth_days + bmi + age_group + income + sex +
    education + smoking + exercise,
              data = brfss_analytic)

cat("=== BACKWARD ELIMINATION ===\n\n")

## === BACKWARD ELIMINATION ===

# Step 1: Maximum model
mod_back <- mod_max
cat("Step 1: Maximum model\n")

## Step 1: Maximum model

cat("Variables:", paste(names(coef(mod_back))[-1], collapse = ", "), "\n")

## Variables: menthlth_days, bmi, age_group35-49, age_group50-64, age_group65+, income$25K-$49K, income$50K-$99K, income$100K+, sexFemale, educationHS/GED, educationSome college, educationCollege graduate, smokingFormer, smokingCurrent, exerciseYes

# Show p-values for the maximum model
pvals <- tidy(mod_back) |>
  filter(term != "(Intercept)") |>
  arrange(desc(p.value)) |>
  dplyr::select(term, estimate, p.value) |>
  mutate(across(where(is.numeric), \(x) round(x, 4)))

pvals |>
  head(5) |>
  kable(caption = "Maximum Model: Variables Sorted by p-value (Highest First)") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Maximum Model: Variables Sorted by p-value (Highest First)
term	estimate	p.value
sexFemale	0.0309	0.8635
educationSome college	-0.7213	0.1113
educationHS/GED	-1.1996	0.0078
educationCollege graduate	-1.2362	0.0065
smokingFormer	0.6796	0.0010

# Automated backward elimination using AIC
mod_backward <- step(mod_max, direction = "backward", trace = 1)

## Start:  AIC=33002.4
## physhlth_days ~ menthlth_days + bmi + age_group + income + sex + 
##     education + smoking + exercise
## 
##                 Df Sum of Sq    RSS   AIC
## - sex            1         2 493117 33000
## <none>                       493115 33002
## - education      3       747 493862 33009
## - bmi            1       729 493844 33012
## - smoking        2      1293 494408 33019
## - income         3      7823 500938 33122
## - age_group      3      8240 501355 33129
## - exercise       1     14462 507577 33232
## - menthlth_days  1     43656 536771 33679
## 
## Step:  AIC=33000.43
## physhlth_days ~ menthlth_days + bmi + age_group + income + education + 
##     smoking + exercise
## 
##                 Df Sum of Sq    RSS   AIC
## <none>                       493117 33000
## - education      3       745 493862 33007
## - bmi            1       728 493845 33010
## - smoking        2      1295 494412 33017
## - income         3      7909 501026 33122
## - age_group      3      8302 501418 33128
## - exercise       1     14480 507597 33230
## - menthlth_days  1     44141 537257 33684

tidy(mod_backward, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Backward Elimination Result (AIC-based)",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Backward Elimination Result (AIC-based)
Term	Estimate	SE	t	p-value	CI Lower	CI Upper
(Intercept)	5.8000	0.6530	8.8821	0.0000	4.5200	7.0800
menthlth_days	0.3042	0.0114	26.7352	0.0000	0.2819	0.3265
bmi	0.0480	0.0140	3.4339	0.0006	0.0206	0.0754
age_group35-49	1.3477	0.2981	4.5215	0.0000	0.7634	1.9320
age_group50-64	2.6452	0.2870	9.2171	0.0000	2.0826	3.2077
age_group65+	2.8828	0.2740	10.5221	0.0000	2.3457	3.4198
income$25K-$49K	-2.6241	0.3073	-8.5398	0.0000	-3.2264	-2.0217
income$50K-$99K	-3.2385	0.2952	-10.9701	0.0000	-3.8172	-2.6598
income$100K+	-3.8271	0.4303	-8.8949	0.0000	-4.6705	-2.9837
educationHS/GED	-1.1978	0.4508	-2.6567	0.0079	-2.0815	-0.3140
educationSome college	-0.7178	0.4525	-1.5863	0.1127	-1.6048	0.1692
educationCollege graduate	-1.2319	0.4531	-2.7188	0.0066	-2.1201	-0.3437
smokingFormer	0.6768	0.2052	3.2977	0.0010	0.2745	1.0791
smokingCurrent	1.1503	0.2964	3.8807	0.0001	0.5692	1.7313
exerciseYes	-3.4505	0.2253	-15.3127	0.0000	-3.8922	-3.0088

#Forward selection
brfss_analytic <- na.omit(brfss_analytic)

# Null model
mod_null <- lm(physhlth_days ~ 1, data = brfss_analytic)

# Full model (DO NOT redefine if already created earlier — but kept here for safety)
mod_max <- lm(
  physhlth_days ~ menthlth_days + bmi + age_group + income + sex +
    education + smoking + exercise,
  data = brfss_analytic
)

cat("=== FORWARD SELECTION ===\n\n")

## === FORWARD SELECTION ===

cat("Step 1: Null model\n")

## Step 1: Null model

cat("Variables: Intercept only\n")

## Variables: Intercept only

# Automated forward selection using AIC
mod_forward <- step(
  mod_null,
  scope = list(lower = mod_null, upper = mod_max),
  direction = "forward",
  trace = 1
)

## Start:  AIC=34603.45
## physhlth_days ~ 1
## 
##                 Df Sum of Sq    RSS   AIC
## + menthlth_days  1     59071 545563 33783
## + exercise       1     37706 566927 34090
## + income         3     32443 572191 34168
## + smoking        2     12018 592616 34447
## + education      3     11873 592761 34451
## + age_group      3      5515 599119 34536
## + bmi            1      4732 599901 34543
## + sex            1      1559 603075 34585
## <none>                       604634 34603
## 
## Step:  AIC=33783.01
## physhlth_days ~ menthlth_days
## 
##             Df Sum of Sq    RSS   AIC
## + exercise   1   28346.8 517216 33358
## + income     3   19686.5 525876 33495
## + age_group  3   14455.2 531108 33574
## + education  3    8240.9 537322 33667
## + smoking    2    6070.4 539492 33697
## + bmi        1    2821.0 542742 33744
## + sex        1     266.4 545296 33781
## <none>                   545563 33783
## 
## Step:  AIC=33358.15
## physhlth_days ~ menthlth_days + exercise
## 
##             Df Sum of Sq    RSS   AIC
## + income     3   11764.1 505452 33180
## + age_group  3    9921.3 507295 33209
## + smoking    2    3952.9 513263 33301
## + education  3    3409.0 513807 33311
## + bmi        1     753.5 516463 33348
## <none>                   517216 33358
## + sex        1     106.7 517109 33359
## 
## Step:  AIC=33180.09
## physhlth_days ~ menthlth_days + exercise + income
## 
##             Df Sum of Sq    RSS   AIC
## + age_group  3    9400.2 496052 33036
## + smoking    2    2483.5 502968 33145
## + bmi        1     781.6 504670 33170
## + education  3     937.1 504515 33171
## <none>                   505452 33180
## + sex        1       6.7 505445 33182
## 
## Step:  AIC=33035.91
## physhlth_days ~ menthlth_days + exercise + income + age_group
## 
##             Df Sum of Sq    RSS   AIC
## + smoking    2   1427.13 494625 33017
## + education  3   1013.78 495038 33026
## + bmi        1    666.76 495385 33027
## <none>                   496052 33036
## + sex        1     14.93 496037 33038
## 
## Step:  AIC=33016.86
## physhlth_days ~ menthlth_days + exercise + income + age_group + 
##     smoking
## 
##             Df Sum of Sq    RSS   AIC
## + bmi        1    762.78 493862 33007
## + education  3    779.94 493845 33010
## <none>                   494625 33017
## + sex        1      0.00 494625 33019
## 
## Step:  AIC=33006.52
## physhlth_days ~ menthlth_days + exercise + income + age_group + 
##     smoking + bmi
## 
##             Df Sum of Sq    RSS   AIC
## + education  3    745.36 493117 33000
## <none>                   493862 33007
## + sex        1      0.35 493862 33009
## 
## Step:  AIC=33000.43
## physhlth_days ~ menthlth_days + exercise + income + age_group + 
##     smoking + bmi + education
## 
##        Df Sum of Sq    RSS   AIC
## <none>              493117 33000
## + sex   1    1.8268 493115 33002

tidy(mod_forward, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Forward Selection Result (AIC-based)",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Forward Selection Result (AIC-based)
Term	Estimate	SE	t	p-value	CI Lower	CI Upper
(Intercept)	5.8000	0.6530	8.8821	0.0000	4.5200	7.0800
menthlth_days	0.3042	0.0114	26.7352	0.0000	0.2819	0.3265
exerciseYes	-3.4505	0.2253	-15.3127	0.0000	-3.8922	-3.0088
income$25K-$49K	-2.6241	0.3073	-8.5398	0.0000	-3.2264	-2.0217
income$50K-$99K	-3.2385	0.2952	-10.9701	0.0000	-3.8172	-2.6598
income$100K+	-3.8271	0.4303	-8.8949	0.0000	-4.6705	-2.9837
age_group35-49	1.3477	0.2981	4.5215	0.0000	0.7634	1.9320
age_group50-64	2.6452	0.2870	9.2171	0.0000	2.0826	3.2077
age_group65+	2.8828	0.2740	10.5221	0.0000	2.3457	3.4198
smokingFormer	0.6768	0.2052	3.2977	0.0010	0.2745	1.0791
smokingCurrent	1.1503	0.2964	3.8807	0.0001	0.5692	1.7313
bmi	0.0480	0.0140	3.4339	0.0006	0.0206	0.0754
educationHS/GED	-1.1978	0.4508	-2.6567	0.0079	-2.0815	-0.3140
educationSome college	-0.7178	0.4525	-1.5863	0.1127	-1.6048	0.1692
educationCollege graduate	-1.2319	0.4531	-2.7188	0.0066	-2.1201	-0.3437

#Stepwise selection
mod_stepwise <- step(mod_null,
                     scope = list(lower = mod_null, upper = mod_max),
                     direction = "both", trace = 1)

## Start:  AIC=34603.45
## physhlth_days ~ 1
## 
##                 Df Sum of Sq    RSS   AIC
## + menthlth_days  1     59071 545563 33783
## + exercise       1     37706 566927 34090
## + income         3     32443 572191 34168
## + smoking        2     12018 592616 34447
## + education      3     11873 592761 34451
## + age_group      3      5515 599119 34536
## + bmi            1      4732 599901 34543
## + sex            1      1559 603075 34585
## <none>                       604634 34603
## 
## Step:  AIC=33783.01
## physhlth_days ~ menthlth_days
## 
##                 Df Sum of Sq    RSS   AIC
## + exercise       1     28347 517216 33358
## + income         3     19686 525876 33495
## + age_group      3     14455 531108 33574
## + education      3      8241 537322 33667
## + smoking        2      6070 539492 33697
## + bmi            1      2821 542742 33744
## + sex            1       266 545296 33781
## <none>                       545563 33783
## - menthlth_days  1     59071 604634 34603
## 
## Step:  AIC=33358.15
## physhlth_days ~ menthlth_days + exercise
## 
##                 Df Sum of Sq    RSS   AIC
## + income         3     11764 505452 33180
## + age_group      3      9921 507295 33209
## + smoking        2      3953 513263 33301
## + education      3      3409 513807 33311
## + bmi            1       754 516463 33348
## <none>                       517216 33358
## + sex            1       107 517109 33359
## - exercise       1     28347 545563 33783
## - menthlth_days  1     49711 566927 34090
## 
## Step:  AIC=33180.09
## physhlth_days ~ menthlth_days + exercise + income
## 
##                 Df Sum of Sq    RSS   AIC
## + age_group      3      9400 496052 33036
## + smoking        2      2484 502968 33145
## + bmi            1       782 504670 33170
## + education      3       937 504515 33171
## <none>                       505452 33180
## + sex            1         7 505445 33182
## - income         3     11764 517216 33358
## - exercise       1     20424 525876 33495
## - menthlth_days  1     41532 546984 33810
## 
## Step:  AIC=33035.91
## physhlth_days ~ menthlth_days + exercise + income + age_group
## 
##                 Df Sum of Sq    RSS   AIC
## + smoking        2      1427 494625 33017
## + education      3      1014 495038 33026
## + bmi            1       667 495385 33027
## <none>                       496052 33036
## + sex            1        15 496037 33038
## - age_group      3      9400 505452 33180
## - income         3     11243 507295 33209
## - exercise       1     17199 513251 33307
## - menthlth_days  1     47177 543229 33761
## 
## Step:  AIC=33016.86
## physhlth_days ~ menthlth_days + exercise + income + age_group + 
##     smoking
## 
##                 Df Sum of Sq    RSS   AIC
## + bmi            1       763 493862 33007
## + education      3       780 493845 33010
## <none>                       494625 33017
## + sex            1         0 494625 33019
## - smoking        2      1427 496052 33036
## - age_group      3      8344 502968 33145
## - income         3      9897 504522 33169
## - exercise       1     16705 511329 33281
## - menthlth_days  1     44789 539414 33708
## 
## Step:  AIC=33006.52
## physhlth_days ~ menthlth_days + exercise + income + age_group + 
##     smoking + bmi
## 
##                 Df Sum of Sq    RSS   AIC
## + education      3       745 493117 33000
## <none>                       493862 33007
## + sex            1         0 493862 33009
## - bmi            1       763 494625 33017
## - smoking        2      1523 495385 33027
## - age_group      3      8245 502107 33133
## - income         3      9781 503643 33157
## - exercise       1     15215 509077 33247
## - menthlth_days  1     44206 538068 33690
## 
## Step:  AIC=33000.43
## physhlth_days ~ menthlth_days + exercise + income + age_group + 
##     smoking + bmi + education
## 
##                 Df Sum of Sq    RSS   AIC
## <none>                       493117 33000
## + sex            1         2 493115 33002
## - education      3       745 493862 33007
## - bmi            1       728 493845 33010
## - smoking        2      1295 494412 33017
## - income         3      7909 501026 33122
## - age_group      3      8302 501418 33128
## - exercise       1     14480 507597 33230
## - menthlth_days  1     44141 537257 33684

tidy(mod_stepwise, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Stepwise Selection Result (AIC-based)",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Stepwise Selection Result (AIC-based)
Term	Estimate	SE	t	p-value	CI Lower	CI Upper
(Intercept)	5.8000	0.6530	8.8821	0.0000	4.5200	7.0800
menthlth_days	0.3042	0.0114	26.7352	0.0000	0.2819	0.3265
exerciseYes	-3.4505	0.2253	-15.3127	0.0000	-3.8922	-3.0088
income$25K-$49K	-2.6241	0.3073	-8.5398	0.0000	-3.2264	-2.0217
income$50K-$99K	-3.2385	0.2952	-10.9701	0.0000	-3.8172	-2.6598
income$100K+	-3.8271	0.4303	-8.8949	0.0000	-4.6705	-2.9837
age_group35-49	1.3477	0.2981	4.5215	0.0000	0.7634	1.9320
age_group50-64	2.6452	0.2870	9.2171	0.0000	2.0826	3.2077
age_group65+	2.8828	0.2740	10.5221	0.0000	2.3457	3.4198
smokingFormer	0.6768	0.2052	3.2977	0.0010	0.2745	1.0791
smokingCurrent	1.1503	0.2964	3.8807	0.0001	0.5692	1.7313
bmi	0.0480	0.0140	3.4339	0.0006	0.0206	0.0754
educationHS/GED	-1.1978	0.4508	-2.6567	0.0079	-2.0815	-0.3140
educationSome college	-0.7178	0.4525	-1.5863	0.1127	-1.6048	0.1692
educationCollege graduate	-1.2319	0.4531	-2.7188	0.0066	-2.1201	-0.3437

In 3-5 sentences, compare the results: • Do the three methods agree on the same final model? • Which variables (if any) were excluded by all three methods? • Which variables were retained by all three methods?

The 3 models all produced the same final mode, indicating strong agreement in variable selection. No variables were excluded by all three models since each method has identical outcomes. The variables retained by all three methods include: menthlth_days, bmi, age_group, income, education, smoking, and exerciseYes. This shows that all predictors are consistently important under all selection models. #Step 4: Final Model Selection and Interpretation

tidy(mod_forward, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Final Model: Coefficients with Confidence Intervals",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Final Model: Coefficients with Confidence Intervals
Term	Estimate	SE	t	p-value	CI Lower	CI Upper
(Intercept)	5.8000	0.6530	8.8821	0.0000	4.5200	7.0800
menthlth_days	0.3042	0.0114	26.7352	0.0000	0.2819	0.3265
exerciseYes	-3.4505	0.2253	-15.3127	0.0000	-3.8922	-3.0088
income$25K-$49K	-2.6241	0.3073	-8.5398	0.0000	-3.2264	-2.0217
income$50K-$99K	-3.2385	0.2952	-10.9701	0.0000	-3.8172	-2.6598
income$100K+	-3.8271	0.4303	-8.8949	0.0000	-4.6705	-2.9837
age_group35-49	1.3477	0.2981	4.5215	0.0000	0.7634	1.9320
age_group50-64	2.6452	0.2870	9.2171	0.0000	2.0826	3.2077
age_group65+	2.8828	0.2740	10.5221	0.0000	2.3457	3.4198
smokingFormer	0.6768	0.2052	3.2977	0.0010	0.2745	1.0791
smokingCurrent	1.1503	0.2964	3.8807	0.0001	0.5692	1.7313
bmi	0.0480	0.0140	3.4339	0.0006	0.0206	0.0754
educationHS/GED	-1.1978	0.4508	-2.6567	0.0079	-2.0815	-0.3140
educationSome college	-0.7178	0.4525	-1.5863	0.1127	-1.6048	0.1692
educationCollege graduate	-1.2319	0.4531	-2.7188	0.0066	-2.1201	-0.3437

Choose your final model. State which method(s) guided your choice and why. Display the coefficient table for your final model using tidy() with confidence interval. Interpret the coefficients for at least three predictors in plain language, including units and “holding all other variables constant” language. Include at least one continuous and one categorical predictor.

The final model was selected using stepwise selection. This model was chosen because it evaluates predictors in both directions and provides a balance between model fits. Since all three model selections all produced the same final set of predictors, the results were consistent and suggest a stable model.

Holding all other variables constant, each one-unit increase in BMI is associated with an increase of approximately 0.048 physically unhealthy days in the past 30 days. This shows that higher BMI is associated with worse physical health outcomes.

Holding all other variables constant, each additional mentally unhealthy day is associated with an increase of approximately 0.304 physically unhealthy days. This shows a strong relationship between mental and physical health, where worse mental health corresponds to worse physical health.

Holding all other variables constant, individuals who reported any physical activity in the past 30 days had on average 3.45 fewer physically unhealthy days compared to those who did not exercise. This shows that physical activity is strongly associated with better physical health.

Holding all other variables constant, individuals earning $100K+ annually reported about 3.83 fewer physically unhealthy days compared to the lowest income group. This shows that the higher income groups are associated with fewer physically unhealthy days compared to lower income groups.

#Step 5: LINE Assumption Check

par(mfrow = c(2, 2))
plot(mod_forward)  # or your final model name

par(mfrow = c(1, 1))

Residuals vs. Fitted: Is there evidence of non-linearity or non-constant variance?

There is no evidence of non-linearity and non-constant variance.

Normal Q-Q: Do the residuals approximately follow a normal distribution? Where do they deviate?

The residuals do not perfectly follow a normal distribution. They align fairly well in the middle but deviate in the tails, especially the upper tail, indicating potential skewness or heavy tails.

Scale-Location: Is the spread of residuals roughly constant across fitted values?

The spread of residuals is not constant across fitted values. The upward trend suggests increasing variance as fitted values increase.

Residuals vs. Leverage: Are there any highly influential observations (large Cook’s distance)

There are no strongly influential observations. While a few points have slightly higher leverage, non appear to exeed cooks distnace thresholds.

Write a brief conclusion (2-3 sentences): Are the LINE assumptions reasonably satisfied? What violations, if any, do you observe?

The LINE assumptions are somewhat satisfied. Linearity and constant variance are somewhat violated due to the visible patterns and heteroscedasticity, and normality is imperfect due to tail deviations. However, there are no major issues with influential observations, so the model may still be usable with cation.

#Part 2: Logistic Regression #Step 1: Create the Binary Outcome

brfss_new <- brfss_analytic |>
  mutate(
    frequent_distress = ifelse(physhlth_days >= 14, 1, 0),
    frequent_distress = factor(frequent_distress,
                               levels = c(0, 1),
                               labels = c("No", "Yes"))
  )

brfss_new |>
  count(frequent_distress) |>
  mutate(percent = n / sum(n) * 100) |>
  mutate(percent = round(percent, 1)) |>
  kable(
    caption = "Prevalence of Frequent Physical Distress",
    col.names = c("Frequent Distress", "Count", "Percent")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Prevalence of Frequent Physical Distress
Frequent Distress	Count	Percent
No	6914	86.4
Yes	1086	13.6

In 1-2 sentences, comment on the balance of the outcome.

Approximately 13.6% of respondents report frequent physical distress, while 86% do not report frequent physical distress. This shows that the outcome is moderately imbalanced, with a much larger proportion of individuals in the “no” category, which is typical for health burden indicators.

#Step 2: Simple (Unadjusted) Logistic Regression

Choose one predictor from your final linear model that you expect to have a strong association with physical distress. State why you chose it.

I chose mentally unhealthy days because it is a key component of overall health status and is strongly linked to physical health outcomes through biological and behavioral pathways.

mod_ment <- glm(frequent_distress ~ menthlth_days, data = brfss_new,
                    family = binomial(link = "logit"))


tidy(mod_ment, conf.int = TRUE, exponentiate = FALSE) |>
  kable(digits = 3, caption = "Simple Logistic Regression: FPD ~ mentally unhealthy days (Log-Odds Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Simple Logistic Regression: FPD ~ mentally unhealthy days (Log-Odds Scale)
term	estimate	std.error	statistic	p.value	conf.low	conf.high
(Intercept)	-2.281	0.042	-54.333	0	-2.364	-2.199
menthlth_days	0.070	0.003	22.119	0	0.064	0.077

mod_ment |>
  tbl_regression(
    exponentiate = TRUE,
    label = list(menthlth_days ~ "Mentally unhealthy days in past 30 days")
  ) |>
  bold_labels() |>
  bold_p()

Characteristic	OR	95% CI	p-value
Mentally unhealthy days in past 30 days	1.07	1.07, 1.08	<0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

Interpret the odds ratio in plain language. State whether the association is statistically significant at alpha = 0.05 and how you know (Wald test p-value or CI excluding 1.0)

For every one unit increase in mentally unhealthy days, the odds of frequent physical distress are 1.08, with a 95% CI of [1.07,1.08], holding all else constant. This means that individuals with more mentally unhealthy days have higher odds of experiencing frequent physical distress. The association is statistically significant with a p value less than 0.05 (<0.001) and the CI does not include 1.

#Step 3: Multiple Logistic Regression

mod_logistic <- glm(
  frequent_distress ~ menthlth_days + bmi + age_group + income + sex +
    education + smoking + exercise,
  data = brfss_new,
  family = binomial)
summary(mod_logistic)

## 
## Call:
## glm(formula = frequent_distress ~ menthlth_days + bmi + age_group + 
##     income + sex + education + smoking + exercise, family = binomial, 
##     data = brfss_new)
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -2.255697   0.243962  -9.246  < 2e-16 ***
## menthlth_days              0.068198   0.003664  18.611  < 2e-16 ***
## bmi                        0.014348   0.005155   2.783 0.005380 ** 
## age_group35-49             0.762689   0.147703   5.164 2.42e-07 ***
## age_group50-64             1.213299   0.139855   8.675  < 2e-16 ***
## age_group65+               1.291576   0.137163   9.416  < 2e-16 ***
## income$25K-$49K           -0.589866   0.101555  -5.808 6.31e-09 ***
## income$50K-$99K           -0.853635   0.100733  -8.474  < 2e-16 ***
## income$100K+              -1.498863   0.230273  -6.509 7.56e-11 ***
## sexFemale                  0.014039   0.073171   0.192 0.847851    
## educationHS/GED           -0.336043   0.149017  -2.255 0.024129 *  
## educationSome college     -0.148256   0.149889  -0.989 0.322613    
## educationCollege graduate -0.494821   0.155068  -3.191 0.001418 ** 
## smokingFormer              0.296136   0.081651   3.627 0.000287 ***
## smokingCurrent             0.358801   0.105610   3.397 0.000680 ***
## exerciseYes               -0.885038   0.076998 -11.494  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6354.8  on 7999  degrees of freedom
## Residual deviance: 5280.5  on 7984  degrees of freedom
## AIC: 5312.5
## 
## Number of Fisher Scoring iterations: 6

tidy(mod_logistic, conf.int = TRUE, exponentiate = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 3))) |>
  kable(
    caption = "Adjusted Odds Ratios for Frequent Physical Distress",
    digits = 3
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Adjusted Odds Ratios for Frequent Physical Distress
term	estimate	std.error	statistic	p.value	conf.low	conf.high
(Intercept)	0.105	0.244	-9.246	0.000	0.065	0.168
menthlth_days	1.071	0.004	18.611	0.000	1.063	1.078
bmi	1.014	0.005	2.783	0.005	1.004	1.025
age_group35-49	2.144	0.148	5.164	0.000	1.610	2.875
age_group50-64	3.365	0.140	8.675	0.000	2.569	4.448
age_group65+	3.639	0.137	9.416	0.000	2.795	4.787
income$25K-$49K	0.554	0.102	-5.808	0.000	0.454	0.677
income$50K-$99K	0.426	0.101	-8.474	0.000	0.350	0.519
income$100K+	0.223	0.230	-6.509	0.000	0.139	0.345
sexFemale	1.014	0.073	0.192	0.848	0.879	1.171
educationHS/GED	0.715	0.149	-2.255	0.024	0.535	0.960
educationSome college	0.862	0.150	-0.989	0.323	0.645	1.160
educationCollege graduate	0.610	0.155	-3.191	0.001	0.451	0.829
smokingFormer	1.345	0.082	3.627	0.000	1.145	1.577
smokingCurrent	1.432	0.106	3.397	0.001	1.162	1.758
exerciseYes	0.413	0.077	-11.494	0.000	0.355	0.480

Interpret the adjusted odds ratios for at least three predictors in plain language, using “holding all other variables constant” language. Include at least one continuous and one categorical predictor.

Holding all other variables constant, each additional of mentally unhealthy days is associated with 1.071 times the odds of frequent psychical distress(95% CI: 1.063 to 1.078). This shows that the worse mental health is, the higher odds of frequent physical distress after adjusting for other predictors.

Holding all other variables constant, each one-unit increase in BMI is associated with 1.014 times the odds of frequent physical distress (95% CI: 1.004 to 1.025). This shows that the higher the BMI, the higher the odds of frequent physical distress.

Holding all other variables constant, individuals who reported any physical activity in the past 30 days have 0.413 times the odds of frequent physical distress compared to those who did not exercise (95% CI: 0.355 to 0.480).This shows that physical activity is strongly associated with lower odds of frequent physical distress. #Step 4: Likelihood Ratio Test

mod_reduced <- glm(
  frequent_distress ~ menthlth_days + bmi + age_group + income + sex +
    smoking + exercise,
  data = brfss_new,
  family = binomial
)

anova(mod_reduced, mod_logistic, test = "LRT") |>
  kable(digits = 3,
        caption = "LR Test: Does removing education improve the model?") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

LR Test: Does removing education improve the model?
Resid. Df	Resid. Dev	Df	Deviance	Pr(>Chi)
7987	5299.505	NA	NA	NA
7984	5280.523	3	18.982	0

• State the null and alternative hypotheses • Report the test statistic (deviance), degrees of freedom, and p-value • Conclude at alpha = 0.05: Does this group of predictors significantly improve the model?

Null hypothesis: Education level is not associated with frequent physical distress(all education coefficients=0).

Alternative hypothesis: Education level is associated with frequent physical distress(at least one education coefficients≠0).

Chi-square (Deviance) = 18.982 Degrees of freedom = 3 p-value = <0.001

At a p-value of less than 0.05, we can reject the null hypothesis because the p-value is <0.001. This group of predictors significantly improves the model.

#Step 5: ROC Curve and AUC

roc_obj <- roc(
  response = brfss_new$frequent_distress,
  predictor = fitted(mod_logistic),
  levels = c("No", "Yes"),
  direction = "<"
)

auc_value <- auc(roc_obj)

ggroc(roc_obj, color = "steelblue", linewidth = 1.2) +
  geom_abline(slope = 1, intercept = 1, linetype = "dashed", color = "red") +
  labs(title = "ROC Curve for Frequent Mental Distress Model",
       subtitle = paste0("AUC = ", round(auc_value, 3)),
       x = "Specificity", y = "Sensitivity") +
  theme_minimal()

Interpret the AUC: What does this value tell you about the model’s ability to discriminate between individuals with and without frequent physical distress? Use the following guide: • 0.5 = no discrimination (coin flip) • 0.5-0.7 = poor discrimination • 0.7-0.8 = acceptable discrimination • 0.8-0.9 = excellent discrimination • 0.9+ = outstanding discrimination

The area under the ROC curve for is 0.777.This shows an acceptable discrimination, meaning the model has a good ability to distinguish between individuals with and without frequent physical distress. #Step 6: Hosmer-Lemeshow Goodness-of-Fit Test

hl_test <- hoslem.test(
  x = as.numeric(brfss_new$frequent_distress) - 1,
  y = fitted(mod_logistic),
  g = 10
)

hl_test

## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  as.numeric(brfss_new$frequent_distress) - 1, fitted(mod_logistic)
## X-squared = 11.097, df = 8, p-value = 0.1963

State the null hypothesis (the model fits the data well) and alternative hypothesis (the model does not fit the data well) • Report the test statistic, degrees of freedom, and p-value • Interpret: At alpha = 0.05, is there evidence of poor model fit? • In 1-2 sentences, discuss how the Hosmer-Lemeshow result complements the ROC/AUC finding

Null hypothesis (H₀): The logistic regression model fits the data well (no evidence of poor fit).

Alternative hypothesis (H₁): The logistic regression model does not fit the data well (evidence of poor fit).

Chi-square (X²) = 11.097 Degrees of freedom = 8 p-value = 0.1963

We fail to reject the null hypothesis because the p-value (0.1963) is greater than 0.05, This shows no evidence of poor model fit, suggesting the model fits the data adequately. The Hosmer-Lemeshow test evaluates calibration, while ROC/AUC measures discrimination. Together they suggest the model both fits the data reasonably well and have acceptable performance.

#Part 3: Reflection Write 250-400 words in continuous prose (no bullet points). A. Statistical Insights (5 points): What do your results suggest about the factors associated with physical health burden among U.S. adults? Which predictors were the strongest in both the linear and logistic models? Were any predictors significant in one model but not the other? What are the key limitations of using cross-sectional survey data for this type of analysis? Name at least two specific potential confounders not measured in your model.

The results suggest that both behavioral and socioeconomic predictors play an important role in physical health. Across both the linear and logistic models, mentally unhealthy days emerged as one of the strongest predictors showing a large and statistically significant association with worse health outcomes. In addition, exercise demonstrated a strong effect as well with individuals who reported physical activity having fewer unhealthy days and lower odds of frequent physical distress than those who didn’t participant in physical activity. Income was associated with better health for those who had higher income. For age, individuals who were older had worse outcomes. Most predictors were consistent across both models, although some variables were not statistically significant in one model while appearing more relevant in other. A key to using cross-sectional data is the lack of causality. In addition, there may be confounding variables that were not measured that could be influencing the preditors and outcomes.

B. Linear versus Logistic Regression (5 points): Compare what the linear regression model tells you versus what the logistic regression model tells you about the same research question. What information does each approach provide that the other does not? In what situations would you prefer one approach over the other? How do the model selection criteria differ between the two frameworks (for example, R-squared versus AUC)?

The linear model estimates changes in the number of physically unhealthy days with a direct interpretation in terms of units, while the logistic model estimates odds of experiencing frequent physical distress which is used to see who is classified as high risk. The linear model uses r-squared to assess variance, while the logistic models uses measures like AUC to evaluate classification performance. Linear regression is preferred when the outcome is continuous, while logistic regression is used when the outcome is binary.

C. AI Transparency (5 points): Describe any parts of this assignment where you sought AI assistance. Include the specific prompts you used and how you verified the correctness of the output. If you did not use AI, state this explicitly.

I used AI to help with debugging. For example, when my code was not running for initial setup of brfss_2023, I asked co-pilot “why is my code not running” and I would insert the code that I had written. For initial set up, I had used “clean_names” function, which standardized variable names by removing special characters, so I did not need to include the “_BMI” and I needed to just have “BMI” for all the predictors like this, which at the time I did not realize. Co-pilot was able to tell me to take the underscore away and just use the name.

Assignment 4

Grace Beal

April 24, 2026

Setup and Data

Part 0: Data Acquisition and Preparation

Analytic Sample

Part 1: Model Selection for Linear Regression

Step 1: Fit the Maximum Model