HW 4

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.4.3

## Warning: package 'ggplot2' was built under R version 4.4.3

## Warning: package 'tibble' was built under R version 4.4.3

## Warning: package 'tidyr' was built under R version 4.4.3

## Warning: package 'readr' was built under R version 4.4.3

## Warning: package 'purrr' was built under R version 4.4.3

## Warning: package 'dplyr' was built under R version 4.4.3

## Warning: package 'stringr' was built under R version 4.4.3

## Warning: package 'forcats' was built under R version 4.4.3

## Warning: package 'lubridate' was built under R version 4.4.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(haven)

## Warning: package 'haven' was built under R version 4.4.3

library(broom)

## Warning: package 'broom' was built under R version 4.4.3

library(kableExtra)

## Warning: package 'kableExtra' was built under R version 4.4.3

## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

library(car)

## Warning: package 'car' was built under R version 4.4.3

## Loading required package: carData

## Warning: package 'carData' was built under R version 4.4.3

## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some

library(gtsummary)

## Warning: package 'gtsummary' was built under R version 4.4.3

library(leaps)

## Warning: package 'leaps' was built under R version 4.4.3

library(pROC)

## Warning: package 'pROC' was built under R version 4.4.3

## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

library(ResourceSelection)

## Warning: package 'ResourceSelection' was built under R version 4.4.3

## ResourceSelection 0.3-6   2023-06-27

brfss_data <- read_xpt("/Users/sarah/OneDrive/Documents/EPI 553/data/LLCP2023.XPT") %>%
  janitor::clean_names()

names(brfss_data)

##   [1] "state"    "fmonth"   "idate"    "imonth"   "iday"     "iyear"   
##   [7] "dispcode" "seqno"    "psu"      "ctelenm1" "pvtresd1" "colghous"
##  [13] "statere1" "celphon1" "ladult1"  "numadult" "respslc1" "landsex2"
##  [19] "lndsxbrt" "safetime" "ctelnum1" "cellfon5" "cadult1"  "cellsex2"
##  [25] "celsxbrt" "pvtresd3" "cclghous" "cstate1"  "landline" "hhadult" 
##  [31] "sexvar"   "genhlth"  "physhlth" "menthlth" "poorhlth" "primins1"
##  [37] "persdoc3" "medcost1" "checkup1" "exerany2" "exract12" "exeroft1"
##  [43] "exerhmm1" "exract22" "exeroft2" "exerhmm2" "strength" "bphigh6" 
##  [49] "bpmeds1"  "cholchk3" "toldhi3"  "cholmed3" "cvdinfr4" "cvdcrhd4"
##  [55] "cvdstrk3" "asthma3"  "asthnow"  "chcscnc1" "chcocnc1" "chccopd3"
##  [61] "addepev3" "chckdny2" "havarth4" "diabete4" "diabage4" "marital" 
##  [67] "educa"    "renthom1" "numhhol4" "numphon4" "cpdemo1c" "veteran3"
##  [73] "employ1"  "children" "income3"  "pregnant" "weight2"  "height3" 
##  [79] "deaf"     "blind"    "decide"   "diffwalk" "diffdres" "diffalon"
##  [85] "fall12mn" "fallinj5" "smoke100" "smokday2" "usenow3"  "ecignow2"
##  [91] "alcday4"  "avedrnk3" "drnk3ge5" "maxdrnks" "flushot7" "flshtmy3"
##  [97] "pneuvac4" "shingle2" "hivtst7"  "hivtstd3" "seatbelt" "drnkdri2"
## [103] "covidpo1" "covidsm1" "covidact" "pdiabts1" "prediab2" "diabtype"
## [109] "insulin1" "chkhemo3" "eyeexam1" "diabeye1" "diabedu1" "feetsore"
## [115] "arthexer" "arthedu"  "lmtjoin3" "arthdis2" "joinpai2" "lcsfirst"
## [121] "lcslast"  "lcsnumcg" "lcsctsc1" "lcsscncr" "lcsctwhn" "hadmam"  
## [127] "howlong"  "cervscrn" "crvclcnc" "crvclpap" "crvclhpv" "hadhyst2"
## [133] "psatest1" "psatime1" "pcpsars2" "psasugs1" "pcstalk2" "hadsigm4"
## [139] "colnsigm" "colntes1" "sigmtes1" "lastsig4" "colncncr" "vircolo1"
## [145] "vclntes2" "smalstol" "stoltest" "stooldn2" "bldstfit" "sdnates1"
## [151] "cncrdiff" "cncrage"  "cncrtyp2" "csrvtrt3" "csrvdoc1" "csrvsum" 
## [157] "csrvrtrn" "csrvinst" "csrvinsr" "csrvdein" "csrvclin" "csrvpain"
## [163] "csrvctl2" "indortan" "numburn3" "sunprtct" "wkdayout" "wkendout"
## [169] "cimemlo1" "cdworry"  "cddiscu1" "cdhous1"  "cdsocia1" "caregiv1"
## [175] "crgvrel4" "crgvlng1" "crgvhrs1" "crgvprb3" "crgvalzd" "crgvper1"
## [181] "crgvhou1" "crgvexpt" "lastsmk2" "stopsmk2" "mentcigs" "mentecig"
## [187] "heattbco" "firearm5" "gunload"  "loadulk2" "hasymp1"  "hasymp2" 
## [193] "hasymp3"  "hasymp4"  "hasymp5"  "hasymp6"  "strsymp1" "strsymp2"
## [199] "strsymp3" "strsymp4" "strsymp5" "strsymp6" "firstaid" "aspirin" 
## [205] "birthsex" "somale"   "sofemale" "trnsgndr" "marijan1" "marjsmok"
## [211] "marjeat"  "marjvape" "marjdab"  "marjothr" "usemrjn4" "acedeprs"
## [217] "acedrink" "acedrugs" "aceprisn" "acedivrc" "acepunch" "acehurt1"
## [223] "aceswear" "acetouch" "acetthem" "acehvsex" "aceadsaf" "aceadned"
## [229] "imfvpla4" "hpvadvc4" "hpvadsht" "tetanus1" "covidva1" "covacge1"
## [235] "covidnu2" "lsatisfy" "emtsuprt" "sdlonely" "sdhemply" "foodstmp"
## [241] "sdhfood1" "sdhbills" "sdhutils" "sdhtrnsp" "sdhstre1" "rrclass3"
## [247] "rrcognt2" "rrtreat"  "rratwrk2" "rrhcare4" "rrphysm2" "rcsgend1"
## [253] "rcsxbrth" "rcsrltn2" "casthdx2" "casthno2" "qstver"   "qstlang" 
## [259] "metstat"  "urbstat"  "mscode"   "ststr"    "strwt"    "rawrake" 
## [265] "wt2rake"  "imprace"  "chispnc"  "crace1"   "cageg"    "cllcpwt" 
## [271] "dualuse"  "dualcor"  "llcpwt2"  "llcpwt"   "rfhlth"   "phys14d" 
## [277] "ment14d"  "hlthpl1"  "hcvu653"  "totinda"  "metvl12"  "metvl22" 
## [283] "maxvo21"  "fc601"    "actin13"  "actin23"  "padur1"   "padur2"  
## [289] "pafreq1"  "pafreq2"  "minac12"  "minac22"  "strfreq"  "pamiss3" 
## [295] "pamin13"  "pamin23"  "pa3min"   "pavig13"  "pavig23"  "pa3vigm" 
## [301] "pacat3"   "paindx3"  "pa150r4"  "pa300r4"  "pa30023"  "pastrng" 
## [307] "parec3"   "pastae3"  "rfhype6"  "cholch3"  "rfchol3"  "michd"   
## [313] "ltasth1"  "casthm1"  "asthms1"  "drdxar2"  "mrace1"   "hispanc" 
## [319] "race"     "raceg21"  "racegr3"  "raceprv"  "sex"      "ageg5yr" 
## [325] "age65yr"  "age80"    "age_g"    "htin4"    "htm4"     "wtkg3"   
## [331] "bmi5"     "bmi5cat"  "rfbmi5"   "chldcnt"  "educag"   "incomg1" 
## [337] "smoker3"  "rfsmok3"  "cureci2"  "drnkany6" "drocdy4"  "rfbing6" 
## [343] "drnkwk2"  "rfdrhv8"  "flshot7"  "pneumo3"  "aidtst4"  "rfseat2" 
## [349] "rfseat3"  "drnkdrv"

#Select variables
brfss_work<- brfss_data |>
  dplyr::select(physhlth, menthlth, bmi5, sexvar, educa, exerany2, addepev3, ageg5yr, incomg1, smoker3, genhlth, marital)

brfss_clean <- brfss_work |>
  mutate(
  physhlth_days = case_when(
    physhlth == 88 ~ 0,
    physhlth %in% c (77,99) ~ NA_real_,
    physhlth >= 1 & physhlth <= 30 ~ as.numeric(physhlth)
  ),
menthlth_days = case_when(
  menthlth == 88 ~ 0,
  menthlth %in% c(77,99) ~ NA_real_,
  menthlth >= 1 & menthlth <= 30 ~ as.numeric(menthlth)
),
 bmi = case_when(
      bmi5 == 9999 ~ NA_real_,
      TRUE ~ bmi5 / 100
    ),
sex= factor(
  sexvar,
  levels = c(1,2),
  labels = c("Male","Female")
),
exercise = factor(exerany2,
                  levels = c(2,1),
                  labels = c("No", "Yes")),
depression = factor(addepev3, 
                    levels = c(2,1),
                    labels = c("No", "Yes")),
age_group = case_when(
  ageg5yr %in% 1:3 ~ "18-34",
      ageg5yr %in% 4:6 ~ "35-49",
      ageg5yr %in% 7:9 ~ "50-64",
      ageg5yr %in% 10:13 ~ "65+",
      TRUE ~ NA_character_
),

smoking = case_when(
      smoker3 %in% 1:2 ~ "Current",
      smoker3 == 3 ~ "Former",
      smoker3 == 4 ~ "Never",
      TRUE ~ NA_character_
    ),
income = case_when(
      incomg1 %in% 1:2 ~ "Less than $25K",
      incomg1 %in% 3:4 ~ "$25K-$49K",
      incomg1 %in% 5:6 ~ "$50K-$99K",
      incomg1 == 7 ~ "$100K+",
      TRUE ~ NA_character_
    ),
    gen_health = case_when(
      genhlth %in% 1:2 ~ "Excellent/Very Good",
      genhlth == 3 ~ "Good",
      genhlth %in% 4:5 ~ "Fair/Poor",
      TRUE ~ NA_character_
    ),

    education = case_when(
      educa %in% 1:3 ~ "Less than HS",
      educa == 4 ~ "HS/GED",
      educa == 5 ~ "Some college",
      educa == 6 ~ "College graduate",
      TRUE ~ NA_character_
    ),
 marital = case_when(
      marital %in% c(1, 6) ~ "Married/Partnered",
      marital %in% 2:4 ~ "Previously married",
      marital == 5 ~ "Never married",
      TRUE ~ NA_character_
    )
) %>%
  mutate(
    # Set reference levels
    exercise = relevel(exercise, ref = "No"),
    depression = relevel(depression, ref = "No"),
    age_group = factor(age_group, levels = c("18-34", "35-49", "50-64", "65+")),
    smoking = factor(smoking, levels = c("Never", "Former", "Current")),
    gen_health = factor(gen_health, levels = c("Excellent/Very Good", "Good", "Fair/Poor")),
    education = factor(education, levels = c("Less than HS", "HS/GED", "Some college", "College graduate")),
    income = factor(income, levels = c("Less than $25K", "$25K-$49K", "$50K-$99K", "$100K+")),
    marital = factor(marital, levels = c("Married/Partnered", "Previously married", "Never married"))
  )

#Missingness before dropping NAs
missing_summary <- brfss_clean |>
  summarise(
    n_missing_physhlth = sum(is.na(physhlth_days)),
    pct_missing_physhlth = mean(is.na(physhlth_days)) * 100,
    
    n_missing_menthlth = sum(is.na(menthlth_days)),
    pct_missing_menthlth = mean(is.na(menthlth_days)) * 100,
    
    n_missing_bmi = sum(is.na(bmi)),
    pct_missing_bmi = mean(is.na(bmi)) * 100
  )

missing_summary

## # A tibble: 1 × 6
##   n_missing_physhlth pct_missing_physhlth n_missing_menthlth
##                <int>                <dbl>              <int>
## 1              10785                 2.49               8108
## # ℹ 3 more variables: pct_missing_menthlth <dbl>, n_missing_bmi <int>,
## #   pct_missing_bmi <dbl>

#remove NA and sample

set.seed(1220)
brfss_analytic <- brfss_clean %>%
  drop_na()%>%
  slice_sample(n = 8000)

#final analytic sample size
nrow(brfss_analytic)

## [1] 8000

brfss_analytic %>%
  dplyr::select(physhlth_days, menthlth_days, sex, exercise, depression, age_group, smoking, income, gen_health, education, marital) %>%
  tbl_summary(
    label = list(
      physhlth_days ~ "Physically Unhealthy Days",
      menthlth_days ~ "Mentally Unhealthy Days",
      sex ~ "Sex",
      exercise ~ "Any Physical Activity",
      depression ~ "Depressive disorder",
      age_group ~ "Age",
      smoking ~ "Smoking Status",
      income ~ "Income",
      gen_health ~ "General Health Status",
      education ~ "Education Level",
      marital ~ "Marital Status"
      
    ),
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = all_continuous() ~ 1,
    missing = "no"
  ) %>%
  add_n() %>%

  bold_labels() %>%

  modify_caption("**Table 1. Descriptive Statistics - BRFSS 2023 Analytic Sample ( n =8,000)")

**Table 1. Descriptive Statistics - BRFSS 2023 Analytic Sample ( n =8,000)
Characteristic	N	N = 8,000¹
Physically Unhealthy Days	8,000	4.3 (8.7)
Mentally Unhealthy Days	8,000	4.3 (8.2)
Sex	8,000
Male		3,971 (50%)
Female		4,029 (50%)
Any Physical Activity	8,000	6,157 (77%)
Depressive disorder	8,000	1,776 (22%)
Age	8,000
18-34		1,294 (16%)
35-49		1,657 (21%)
50-64		2,109 (26%)
65+		2,940 (37%)
Smoking Status	8,000
Never		4,808 (60%)
Former		2,230 (28%)
Current		962 (12%)
Income	8,000
Less than $25K		1,090 (14%)
$25K-$49K		1,931 (24%)
$50K-$99K		4,297 (54%)
$100K+		682 (8.5%)
General Health Status	8,000
Excellent/Very Good		3,926 (49%)
Good		2,648 (33%)
Fair/Poor		1,426 (18%)
Education Level	8,000
Less than HS		391 (4.9%)
HS/GED		1,887 (24%)
Some college		2,115 (26%)
College graduate		3,607 (45%)
Marital Status	8,000
Married/Partnered		4,582 (57%)
Previously married		2,050 (26%)
Never married		1,368 (17%)
¹ Mean (SD); n (%)

Part 1: Model Selection for Linear Regression

max_mod <- lm(physhlth_days ~ menthlth_days + sex + exercise + depression + age_group + smoking + income + gen_health + education + marital, data = brfss_analytic)

tidy(max_mod, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Maximum Model: All Predictors",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI lower", "CI upper")
)|>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Maximum Model: All Predictors
Term	Estimate	SE	t	p-value	CI lower	CI upper
(Intercept)	1.6779	0.5148	3.2593	0.0011	0.6688	2.6871
menthlth_days	0.1839	0.0114	16.1977	0.0000	0.1616	0.2062
sexFemale	0.2350	0.1639	1.4336	0.1517	-0.0863	0.5563
exerciseYes	-1.9039	0.2031	-9.3729	0.0000	-2.3020	-1.5057
depressionYes	0.7573	0.2149	3.5234	0.0004	0.3360	1.1787
age_group35-49	0.7358	0.2827	2.6026	0.0093	0.1816	1.2901
age_group50-64	1.3674	0.2778	4.9222	0.0000	0.8228	1.9120
age_group65+	1.7255	0.2785	6.1951	0.0000	1.1795	2.2715
smokingFormer	0.4346	0.1878	2.3149	0.0206	0.0666	0.8027
smokingCurrent	0.4963	0.2653	1.8711	0.0614	-0.0237	1.0164
income$25K-$49K	-1.0835	0.2763	-3.9217	0.0001	-1.6251	-0.5419
income$50K-$99K	-1.5210	0.2761	-5.5092	0.0000	-2.0621	-0.9798
income$100K+	-1.2931	0.3979	-3.2495	0.0012	-2.0731	-0.5130
gen_healthGood	1.3654	0.1839	7.4264	0.0000	1.0050	1.7259
gen_healthFair/Poor	10.0022	0.2487	40.2182	0.0000	9.5147	10.4897
educationHS/GED	0.2889	0.4009	0.7206	0.4712	-0.4970	1.0748
educationSome college	0.9892	0.4027	2.4566	0.0140	0.1998	1.7785
educationCollege graduate	1.0547	0.4045	2.6075	0.0091	0.2618	1.8475
maritalPreviously married	-0.2536	0.2066	-1.2273	0.2197	-0.6586	0.1514
maritalNever married	-0.4054	0.2490	-1.6283	0.1035	-0.8934	0.0826

# Reporting R^2, adjusted R^2, AIC,BIC
glance(max_mod) |>
  dplyr::select(r.squared, adj.r.squared, sigma, AIC, BIC, df.residual) |>
  mutate(across(everything(), \(x) round(x, 3))) |>
  kable(caption = "Maximum Model: Fit Statistics") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Maximum Model: Fit Statistics
r.squared	adj.r.squared	sigma	AIC	BIC	df.residual
0.326	0.324	7.113	54115.19	54261.92	7980

This model explains about 32.6% of the variance in physically unhealthy days ($R^2$ = 0.326, $Adjusted R^2$ = 0.324). The AIC is 54,115.19 and BIC is 54,261.92, which provide baseline fit statistics for comparing alternative models.

best_subset <- regsubsets(physhlth_days ~ menthlth_days + sex + exercise + depression + age_group + smoking + income + gen_health + education + marital, data = brfss_analytic, nvmax = 19, method = "exhaustive")

best_subset_summary <- summary(best_subset)

subset_df <- tibble(
  p = 1:length(best_subset_summary$adjr2),
  `Adj. R²` = best_subset_summary$adjr2,
  BIC = best_subset_summary$bic,
  Cp = best_subset_summary$cp
)

#Ajust R-squared plot
plot1 <- ggplot(subset_df, aes(x = p, y = `Adj. R²`)) +
  geom_line(linewidth = 1, color = "steelblue") +
  geom_point(size = 3, color = "steelblue") +
  geom_vline(xintercept = which.max(best_subset_summary$adjr2),
             linetype = "dashed", color = "tomato") +
  labs(title = "Adjusted R² by Model Size", x = "Number of Variables", y = "Adjusted R²") +
  theme_minimal(base_size = 12)

#BIC plot
plot2 <- ggplot(subset_df, aes(x = p, y = BIC)) +
  geom_line(linewidth = 1, color = "steelblue") +
  geom_point(size = 3, color = "steelblue") +
  geom_vline(xintercept = which.min(best_subset_summary$bic),
             linetype = "dashed", color = "tomato") +
  labs(title = "BIC by Model Size", x = "Number of Variables", y = "BIC") +
  theme_minimal(base_size = 12)

gridExtra::grid.arrange(plot1, plot2, ncol = 2)

cat("Best model by Adj. R²:", which.max(best_subset_summary$adjr2), "variables\n")

## Best model by Adj. R²: 18 variables

cat("Best model by BIC:", which.min(best_subset_summary$bic), "variables\n")

## Best model by BIC: 9 variables

The two criteria do not agree on the best model size. $Adjusted R^2$ selected a 17-variable model because it continues to increase as more predictors are added. BIC selects a 10-variable model due to the stronger penalty for model complexity. BIC favors a simpler model.

#backward elimination
backward_mod <- step(max_mod, direction = "backward", trace =1)

## Start:  AIC=31410.17
## physhlth_days ~ menthlth_days + sex + exercise + depression + 
##     age_group + smoking + income + gen_health + education + marital
## 
##                 Df Sum of Sq    RSS   AIC
## - marital        2       171 403887 31410
## <none>                       403716 31410
## - sex            1       104 403820 31410
## - smoking        2       360 404077 31413
## - depression     1       628 404345 31421
## - education      3       876 404592 31422
## - income         3      1547 405263 31435
## - age_group      3      2242 405959 31448
## - exercise       1      4444 408161 31496
## - menthlth_days  1     13273 416990 31667
## - gen_health     2     85586 489303 32944
## 
## Step:  AIC=31409.55
## physhlth_days ~ menthlth_days + sex + exercise + depression + 
##     age_group + smoking + income + gen_health + education
## 
##                 Df Sum of Sq    RSS   AIC
## <none>                       403887 31410
## - sex            1       103 403990 31410
## - smoking        2       357 404244 31413
## - depression     1       612 404499 31420
## - education      3       875 404762 31421
## - income         3      1394 405281 31431
## - age_group      3      3049 406936 31464
## - exercise       1      4451 408338 31495
## - menthlth_days  1     13238 417125 31666
## - gen_health     2     85645 489532 32944

tidy(backward_mod, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(caption = "Backward Elimination Result (AIC-based)",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Backward Elimination Result (AIC-based)
Term	Estimate	SE	t	p-value	CI Lower	CI Upper
(Intercept)	1.3853	0.4874	2.8425	0.0045	0.4300	2.3407
menthlth_days	0.1836	0.0114	16.1748	0.0000	0.1614	0.2059
sexFemale	0.2329	0.1633	1.4262	0.1539	-0.0872	0.5530
exerciseYes	-1.9048	0.2031	-9.3793	0.0000	-2.3029	-1.5067
depressionYes	0.7471	0.2149	3.4769	0.0005	0.3259	1.1683
age_group35-49	0.8374	0.2703	3.0980	0.0020	0.3075	1.3672
age_group50-64	1.4778	0.2583	5.7206	0.0000	0.9714	1.9841
age_group65+	1.8366	0.2513	7.3099	0.0000	1.3441	2.3292
smokingFormer	0.4399	0.1876	2.3442	0.0191	0.0720	0.8077
smokingCurrent	0.4776	0.2647	1.8038	0.0713	-0.0414	0.9965
income$25K-$49K	-1.0324	0.2749	-3.7557	0.0002	-1.5713	-0.4936
income$50K-$99K	-1.3884	0.2658	-5.2239	0.0000	-1.9094	-0.8674
income$100K+	-1.1122	0.3849	-2.8897	0.0039	-1.8666	-0.3577
gen_healthGood	1.3636	0.1839	7.4161	0.0000	1.0032	1.7241
gen_healthFair/Poor	10.0009	0.2486	40.2245	0.0000	9.5136	10.4883
educationHS/GED	0.2576	0.4006	0.6431	0.5202	-0.5276	1.0428
educationSome college	0.9636	0.4025	2.3943	0.0167	0.1747	1.7525
educationCollege graduate	1.0310	0.4043	2.5502	0.0108	0.2385	1.8236

#Forward selection
null_mod <- lm(physhlth_days ~ 1, data = brfss_analytic)

forward_mod <- step(max_mod, scope = list(lower = null_mod, upper = max_mod), direction = "forward", trace = 1)

## Start:  AIC=31410.17
## physhlth_days ~ menthlth_days + sex + exercise + depression + 
##     age_group + smoking + income + gen_health + education + marital

tidy(forward_mod, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x,4))) |>
  kable(
    caption = "Forward Selection Result (AIC-based)",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Forward Selection Result (AIC-based)
Term	Estimate	SE	t	p-value	CI Lower	CI Upper
(Intercept)	1.6779	0.5148	3.2593	0.0011	0.6688	2.6871
menthlth_days	0.1839	0.0114	16.1977	0.0000	0.1616	0.2062
sexFemale	0.2350	0.1639	1.4336	0.1517	-0.0863	0.5563
exerciseYes	-1.9039	0.2031	-9.3729	0.0000	-2.3020	-1.5057
depressionYes	0.7573	0.2149	3.5234	0.0004	0.3360	1.1787
age_group35-49	0.7358	0.2827	2.6026	0.0093	0.1816	1.2901
age_group50-64	1.3674	0.2778	4.9222	0.0000	0.8228	1.9120
age_group65+	1.7255	0.2785	6.1951	0.0000	1.1795	2.2715
smokingFormer	0.4346	0.1878	2.3149	0.0206	0.0666	0.8027
smokingCurrent	0.4963	0.2653	1.8711	0.0614	-0.0237	1.0164
income$25K-$49K	-1.0835	0.2763	-3.9217	0.0001	-1.6251	-0.5419
income$50K-$99K	-1.5210	0.2761	-5.5092	0.0000	-2.0621	-0.9798
income$100K+	-1.2931	0.3979	-3.2495	0.0012	-2.0731	-0.5130
gen_healthGood	1.3654	0.1839	7.4264	0.0000	1.0050	1.7259
gen_healthFair/Poor	10.0022	0.2487	40.2182	0.0000	9.5147	10.4897
educationHS/GED	0.2889	0.4009	0.7206	0.4712	-0.4970	1.0748
educationSome college	0.9892	0.4027	2.4566	0.0140	0.1998	1.7785
educationCollege graduate	1.0547	0.4045	2.6075	0.0091	0.2618	1.8475
maritalPreviously married	-0.2536	0.2066	-1.2273	0.2197	-0.6586	0.1514
maritalNever married	-0.4054	0.2490	-1.6283	0.1035	-0.8934	0.0826

#Stepwise selection
stepwise_mod <- step(null_mod, scope = list(lower = null_mod, upper= max_mod), direction = "both", trace = 1)

## Start:  AIC=34524.38
## physhlth_days ~ 1
## 
##                 Df Sum of Sq    RSS   AIC
## + gen_health     2    164022 434665 31967
## + menthlth_days  1     60303 538384 33677
## + exercise       1     34542 564145 34051
## + income         3     28842 569845 34135
## + depression     1     20750 577937 34244
## + smoking        2     12790 585897 34356
## + education      3      8593 590094 34415
## + marital        2      7321 591366 34430
## + age_group      3      6527 592160 34443
## + sex            1      1649 597038 34504
## <none>                       598687 34524
## 
## Step:  AIC=31967.07
## physhlth_days ~ gen_health
## 
##                 Df Sum of Sq    RSS   AIC
## + menthlth_days  1     17940 416725 31632
## + exercise       1      6467 428198 31849
## + depression     1      6070 428595 31857
## + income         3      3231 431434 31913
## + smoking        2      1776 432889 31938
## + age_group      3      1536 433129 31945
## + sex            1      1254 433411 31946
## + marital        2       830 433835 31956
## + education      3       416 434250 31965
## <none>                       434665 31967
## - gen_health     2    164022 598687 34524
## 
## Step:  AIC=31631.88
## physhlth_days ~ gen_health + menthlth_days
## 
##                 Df Sum of Sq    RSS   AIC
## + exercise       1      5821 410904 31521
## + age_group      3      4661 412064 31548
## + income         3      1987 414738 31600
## + marital        2      1402 415322 31609
## + smoking        2      1035 415690 31616
## + depression     1       738 415987 31620
## + sex            1       605 416120 31622
## + education      3       319 416406 31632
## <none>                       416725 31632
## - menthlth_days  1     17940 434665 31967
## - gen_health     2    121660 538384 33677
## 
## Step:  AIC=31521.35
## physhlth_days ~ gen_health + menthlth_days + exercise
## 
##                 Df Sum of Sq    RSS   AIC
## + age_group      3      3720 407184 31455
## + income         3      1198 409706 31504
## + marital        2      1065 409839 31505
## + depression     1       739 410165 31509
## + smoking        2       706 410198 31512
## + education      3       687 410217 31514
## + sex            1       449 410455 31515
## <none>                       410904 31521
## - exercise       1      5821 416725 31632
## - menthlth_days  1     17294 428198 31849
## - gen_health     2    101805 512709 33288
## 
## Step:  AIC=31454.58
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group
## 
##                 Df Sum of Sq    RSS   AIC
## + income         3      1163 406021 31438
## + depression     1       933 406251 31438
## + education      3       502 406682 31451
## + sex            1       261 406923 31451
## + smoking        2       361 406823 31451
## <none>                       407184 31455
## + marital        2        40 407144 31458
## - age_group      3      3720 410904 31521
## - exercise       1      4880 412064 31548
## - menthlth_days  1     19957 427141 31835
## - gen_health     2     94875 502059 33126
## 
## Step:  AIC=31437.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income
## 
##                 Df Sum of Sq    RSS   AIC
## + depression     1       862 405159 31423
## + education      3       954 405067 31425
## + smoking        2       305 405716 31436
## + sex            1       203 405818 31436
## <none>                       406021 31438
## + marital        2       154 405867 31439
## - income         3      1163 407184 31455
## - age_group      3      3685 409706 31504
## - exercise       1      4214 410235 31518
## - menthlth_days  1     18945 424966 31801
## - gen_health     2     87509 493530 32995
## 
## Step:  AIC=31422.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression
## 
##                 Df Sum of Sq    RSS   AIC
## + education      3       837 404322 31412
## + smoking        2       259 404900 31422
## + sex            1       110 405049 31423
## <none>                       405159 31423
## + marital        2       170 404990 31423
## - depression     1       862 406021 31438
## - income         3      1092 406251 31438
## - age_group      3      3871 409030 31493
## - exercise       1      4219 409378 31504
## - menthlth_days  1     13674 418834 31686
## - gen_health     2     86610 491770 32969
## 
## Step:  AIC=31412.16
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education
## 
##                 Df Sum of Sq    RSS   AIC
## + smoking        2       332 403990 31410
## <none>                       404322 31412
## + sex            1        78 404244 31413
## + marital        2       168 404154 31413
## - education      3       837 405159 31423
## - depression     1       745 405067 31425
## - income         3      1495 405818 31436
## - age_group      3      3558 407880 31476
## - exercise       1      4604 408926 31501
## - menthlth_days  1     13607 417929 31675
## - gen_health     2     86813 491135 32964
## 
## Step:  AIC=31409.59
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education + smoking
## 
##                 Df Sum of Sq    RSS   AIC
## + sex            1       103 403887 31410
## <none>                       403990 31410
## + marital        2       170 403820 31410
## - smoking        2       332 404322 31412
## - depression     1       691 404681 31421
## - education      3       910 404900 31422
## - income         3      1457 405447 31432
## - age_group      3      3178 407168 31466
## - exercise       1      4502 408492 31496
## - menthlth_days  1     13347 417337 31668
## - gen_health     2     85552 489542 32942
## 
## Step:  AIC=31409.55
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education + smoking + sex
## 
##                 Df Sum of Sq    RSS   AIC
## <none>                       403887 31410
## - sex            1       103 403990 31410
## + marital        2       171 403716 31410
## - smoking        2       357 404244 31413
## - depression     1       612 404499 31420
## - education      3       875 404762 31421
## - income         3      1394 405281 31431
## - age_group      3      3049 406936 31464
## - exercise       1      4451 408338 31495
## - menthlth_days  1     13238 417125 31666
## - gen_health     2     85645 489532 32944

tidy(stepwise_mod, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Stepwise Selection Result (AIC-based)",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Stepwise Selection Result (AIC-based)
Term	Estimate	SE	t	p-value	CI Lower	CI Upper
(Intercept)	1.3853	0.4874	2.8425	0.0045	0.4300	2.3407
gen_healthGood	1.3636	0.1839	7.4161	0.0000	1.0032	1.7241
gen_healthFair/Poor	10.0009	0.2486	40.2245	0.0000	9.5136	10.4883
menthlth_days	0.1836	0.0114	16.1748	0.0000	0.1614	0.2059
exerciseYes	-1.9048	0.2031	-9.3793	0.0000	-2.3029	-1.5067
age_group35-49	0.8374	0.2703	3.0980	0.0020	0.3075	1.3672
age_group50-64	1.4778	0.2583	5.7206	0.0000	0.9714	1.9841
age_group65+	1.8366	0.2513	7.3099	0.0000	1.3441	2.3292
income$25K-$49K	-1.0324	0.2749	-3.7557	0.0002	-1.5713	-0.4936
income$50K-$99K	-1.3884	0.2658	-5.2239	0.0000	-1.9094	-0.8674
income$100K+	-1.1122	0.3849	-2.8897	0.0039	-1.8666	-0.3577
depressionYes	0.7471	0.2149	3.4769	0.0005	0.3259	1.1683
educationHS/GED	0.2576	0.4006	0.6431	0.5202	-0.5276	1.0428
educationSome college	0.9636	0.4025	2.3943	0.0167	0.1747	1.7525
educationCollege graduate	1.0310	0.4043	2.5502	0.0108	0.2385	1.8236
smokingFormer	0.4399	0.1876	2.3442	0.0191	0.0720	0.8077
smokingCurrent	0.4776	0.2647	1.8038	0.0713	-0.0414	0.9965
sexFemale	0.2329	0.1633	1.4262	0.1539	-0.0872	0.5530

The three AIC based methods did not all select the same final model. Backward elimination and stepwise selection agreed on a 17-variable model, while forward selection retains all predictors from the full model. Several predictors are retained by all methods including menthlth_days, exercise, depression, age_group, gen_health, smoking, education, and sex.

Step 4: Final Model Selection

I selected the stepwise AIC model as my final model. Both backward elimination and stepwise selection agreed on the same 17-predictor model, suggesting a stable solution.

tidy(stepwise_mod, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Final Model: Stepwise Selection  (AIC-based)",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Final Model: Stepwise Selection (AIC-based)
Term	Estimate	SE	t	p-value	CI Lower	CI Upper
(Intercept)	1.3853	0.4874	2.8425	0.0045	0.4300	2.3407
gen_healthGood	1.3636	0.1839	7.4161	0.0000	1.0032	1.7241
gen_healthFair/Poor	10.0009	0.2486	40.2245	0.0000	9.5136	10.4883
menthlth_days	0.1836	0.0114	16.1748	0.0000	0.1614	0.2059
exerciseYes	-1.9048	0.2031	-9.3793	0.0000	-2.3029	-1.5067
age_group35-49	0.8374	0.2703	3.0980	0.0020	0.3075	1.3672
age_group50-64	1.4778	0.2583	5.7206	0.0000	0.9714	1.9841
age_group65+	1.8366	0.2513	7.3099	0.0000	1.3441	2.3292
income$25K-$49K	-1.0324	0.2749	-3.7557	0.0002	-1.5713	-0.4936
income$50K-$99K	-1.3884	0.2658	-5.2239	0.0000	-1.9094	-0.8674
income$100K+	-1.1122	0.3849	-2.8897	0.0039	-1.8666	-0.3577
depressionYes	0.7471	0.2149	3.4769	0.0005	0.3259	1.1683
educationHS/GED	0.2576	0.4006	0.6431	0.5202	-0.5276	1.0428
educationSome college	0.9636	0.4025	2.3943	0.0167	0.1747	1.7525
educationCollege graduate	1.0310	0.4043	2.5502	0.0108	0.2385	1.8236
smokingFormer	0.4399	0.1876	2.3442	0.0191	0.0720	0.8077
smokingCurrent	0.4776	0.2647	1.8038	0.0713	-0.0414	0.9965
sexFemale	0.2329	0.1633	1.4262	0.1539	-0.0872	0.5530

Holding all other variables constant, each additional mentally unhealthy day is associated with 0.18 more physically unhealthy days. Individuals reporting any exercise in the past 30 days had an average of 1.90 fewer physically unhealthy days compared to those who did not exercise. Adults reporting fair or poor general health experienced about 10 additional physically unhealthy days per month compared to those in excellent or very good health.

par(mfrow = c(2,2))
plot(stepwise_mod)

par(mfrow = c(1,1))

Residuals vs Fitted: The residuals show a curved pattern and the spread increases at higher fitted values, suggesting non-linearity and non-constant variance.

Q-Q Plot: The middle of the distribution follows the reference line reasonably well. The tails strongly deviate.

Scale-Location: The red line slopes upward and the residual spread increases with fitted values, illustrating a fan shape, confirming heteroscedasticity.

Residuals vs Leverage: No points exceed Cook’s distance.

The LINE assumptions are partially satisfied. The residuals plot showed non-linearity, heteroscedasticity, and non-normal tails, suggesting the model fit could improve.

Part 2: Logistic Regression

brfss_analytic <- brfss_analytic |>
  mutate(frequent_distress = ifelse(physhlth_days >= 14, "Yes", "No"),
                                    frequent_distress = factor(frequent_distress, levels = c("No", "Yes")))

tibble(Metric = c("Observations", "Variables", "Frequent Distress Cases", "Frequent Distress Prevalence"),
       Value  = c(nrow(brfss_analytic), ncol(brfss_analytic),
                  sum(brfss_analytic$frequent_distress == "Yes"),
                  paste0(round(100 * mean(brfss_analytic$frequent_distress == "Yes"), 1), "%"))) |>
  kable(caption = "Frequent Distress Prevalence") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Frequent Distress Prevalence
Metric	Value
Observations	8000
Variables	24
Frequent Distress Cases	1052
Frequent Distress Prevalence	13.2%

Frequent physical distress is relatively uncommon in this sample, with only 13.2% of adults meeting the threshold. This imbalance means the outcome is skewed toward the “No” category.

simple_mod <- glm(frequent_distress ~ gen_health, data = brfss_analytic, family = binomial)

tidy(simple_mod, conf.int = TRUE, exponentiate = FALSE) |>
  kable(digits = 3, caption = "Simple Logistic Regression: Frequent Physical Distress ~ General Health Status (Log-Odds Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Simple Logistic Regression: Frequent Physical Distress ~ General Health Status (Log-Odds Scale)
term	estimate	std.error	statistic	conf.low	conf.high
(Intercept)	-3.398	0.090	-37.672	-3.580	-3.226
gen_healthGood	1.142	0.112	10.197	0.924	1.364
gen_healthFair/Poor	3.289	0.105	31.428	3.087	3.498

exp(coef(simple_mod))

##         (Intercept)      gen_healthGood gen_healthFair/Poor 
##          0.03342985          3.13235705         26.81066762

exp(confint(simple_mod))

## Waiting for profiling to be done...

##                           2.5 %      97.5 %
## (Intercept)          0.02787183  0.03970661
## gen_healthGood       2.52057996  3.91109619
## gen_healthFair/Poor 21.91471664 33.03776143

Odds Ratios Gen_health Good: OR = 3.13 95%CI: 2.52 - 3.91 Gen_health Fair/Poor: OR = 26.81 95% CI: 21.91 - 33.04

Compared to adults reporting excellent or very good general health, those reporting good health have 3.13 times the odds of experiencing frequent physical distress. Adults reporting fair or poor health have 26.81 times the odds of frequent physical distress. Both predictors are statistically significant since the confidence intervals exclude 1.

mod_logistic <- glm(
  frequent_distress ~ gen_health + menthlth_days + exercise +
    age_group + income + depression + education +
    smoking + sex,
  data = brfss_analytic,
  family = binomial
)

tidy(mod_logistic, conf.int = TRUE, exponentiate = TRUE) |>
  kable(digits = 3, caption = "Multiple Logistic Regression") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Multiple Logistic Regression
term	estimate	std.error	statistic	p.value	conf.low	conf.high
(Intercept)	0.021	0.237	-16.298	0.000	0.013	0.033
gen_healthGood	2.420	0.116	7.644	0.000	1.933	3.042
gen_healthFair/Poor	14.535	0.114	23.580	0.000	11.670	18.215
menthlth_days	1.046	0.004	10.298	0.000	1.037	1.055
exerciseYes	0.561	0.085	-6.794	0.000	0.475	0.663
age_group35-49	1.685	0.158	3.305	0.001	1.240	2.304
age_group50-64	2.172	0.147	5.287	0.000	1.635	2.908
age_group65+	2.664	0.144	6.800	0.000	2.017	3.551
income$25K-$49K	0.827	0.111	-1.723	0.085	0.666	1.027
income$50K-$99K	0.645	0.111	-3.946	0.000	0.519	0.802
income$100K+	0.637	0.221	-2.042	0.041	0.408	0.971
depressionYes	1.295	0.096	2.699	0.007	1.072	1.560
educationHS/GED	1.035	0.165	0.210	0.834	0.751	1.434
educationSome college	1.371	0.165	1.914	0.056	0.995	1.899
educationCollege graduate	1.306	0.171	1.561	0.119	0.937	1.832
smokingFormer	1.221	0.090	2.228	0.026	1.024	1.455
smokingCurrent	1.228	0.115	1.784	0.075	0.979	1.536
sexFemale	1.182	0.081	2.073	0.038	1.009	1.386

Holding all other variables constant, each additional mentally unhealthy day is associated with a 4.6% increase in the odds of frequent physical distress (adjusted odds ratio = 1.046, 95% CI: 1.037 - 1.055). Adults reporting fair or poor general health have 14.5 times the odds of frequent physical distress compared to those reporting excellent or very good health (adjusted odds ratio = 14.535, 95% CI:11.670 - 18.214). Adults who engaged in exercise in the past 30 days have 44% lower odds of frequent physical distress compared to those who did not exercise.

#reduced model (smoking removed)
reduced_mod <- glm(
  frequent_distress ~ gen_health + menthlth_days + exercise +
    age_group + depression + education +
    smoking + sex,
  data = brfss_analytic,
  family = binomial
)

anova(reduced_mod, mod_logistic, test = "Chisq") |>
  kable(digits = 3,
        caption = "LR Test: Does adding  smoker improve the model?") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

LR Test: Does adding smoker improve the model?
Resid. Df	Resid. Dev	Df	Deviance	Pr(>Chi)
7985	4440.376	NA	NA	NA
7982	4423.888	3	16.488	0.001

H0: The smoking variables do not improve the model fit. All smoking coefficient s= 0.

Ha: At least one smoking coefficient is not zero.

$Chi^2$ = 16.488 Degrees of freedom = 3 p-value = 0.001

Since the p-value is 0.001, which is below 0.05, we reject the null hypothesis. Adding the smoking variable significantly improves the model.

roc_obj <- roc(brfss_analytic$frequent_distress, fitted(mod_logistic))

## Setting levels: control = No, case = Yes

## Setting direction: controls < cases

plot(roc_obj, main = "ROC Curve: Frequent Physical Distress Model",
 print.auc = TRUE)

auc(roc_obj)

## Area under the curve: 0.8555

ci.auc(roc_obj)

## 95% CI: 0.8427-0.8683 (DeLong)

The AUC value is 0.8555. This indicates excellent discrimination between individuals with and without frequent physical distress.

hoslem.test(mod_logistic$y, fitted(mod_logistic), g = 10)

## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  mod_logistic$y, fitted(mod_logistic)
## X-squared = 6.8688, df = 8, p-value = 0.5509

H0: The logistic regression model fits the data well, no evidence of poor fit

Ha: the model shows evidence of poor fit.

$X^2$ = 6.8688 df = 8 p-value = 0.5509

With a p-value greater than 0.05, we fail to reject the null hypothesis. There is no evidence of poor model fit. The ROC/AUC of 0.8555 showed the model has excellent discrimination, meaning it can effectively distinguish individuals with and without frequent physical distress. The Hosmer-Lemeshow test adds that the model has good calibration, meaning the predicted probabilities match the observed outcomes well.

Part 3: Reflection The results of the regression model point out consistent factors associated with the burden of physical health among US adults. General health status emerged as the strongest predictor. Individuals reporting fair or poor health had substantially higher physically unhealthy days and more than 14 times the odds of frequent physical distress in the logistic model. Some predictors such as specific education and income categories were not statistically significant in the logistic model, despite showing small associations in the linear model. A key limitation of the analysis is the cross-sectional nature of BRFSS data, which prevents establishing causality. Unmeasured confounders such as chronic disease severity and access to health care could bias associations and were not included in the models. Comparing the two modeling approaches highlights how each provides distinct but complementary insights. The linear regression model quantifies how predictors relate to the number of physically unhealthy days, offering a continuous measure of burden. In contrast, the logistic regression model focuses on the probability of crossing a clinically meaningful threshold of frequent distress, which is often more interpretable for public health decision making. Linear regression emphasizes variance explained through r-squared, while logistic regression emphasizes discrimination and calibration through measures such as AUC. The two approaches provide a better understanding of the determinants of physical distress in the population. AI was used during the assignment to help understand coding errors and ways to resolve them based on the warnings. I would include the warning from the console and ask what it meant and/or ways to fix the problem.

HW 4

Sarah Andres

2026-04-14