library(tidyverse)
library(haven)
library(janitor)
library(knitr)
library(kableExtra)
library(broom)
library(gtsummary)
library(car)
library(leaps)
library(MASS)
library(ggeffects)
library (dplyr)
options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))
# 1. Import raw data
brfss_raw <- read_xpt("C:/Users/tahia/OneDrive/Desktop/UAlbany PhD/Epi 553/LLCP2023XPT/LLCP2023.XPT") |>
clean_names()
names(brfss_raw)
## [1] "state" "fmonth" "idate" "imonth" "iday" "iyear"
## [7] "dispcode" "seqno" "psu" "ctelenm1" "pvtresd1" "colghous"
## [13] "statere1" "celphon1" "ladult1" "numadult" "respslc1" "landsex2"
## [19] "lndsxbrt" "safetime" "ctelnum1" "cellfon5" "cadult1" "cellsex2"
## [25] "celsxbrt" "pvtresd3" "cclghous" "cstate1" "landline" "hhadult"
## [31] "sexvar" "genhlth" "physhlth" "menthlth" "poorhlth" "primins1"
## [37] "persdoc3" "medcost1" "checkup1" "exerany2" "exract12" "exeroft1"
## [43] "exerhmm1" "exract22" "exeroft2" "exerhmm2" "strength" "bphigh6"
## [49] "bpmeds1" "cholchk3" "toldhi3" "cholmed3" "cvdinfr4" "cvdcrhd4"
## [55] "cvdstrk3" "asthma3" "asthnow" "chcscnc1" "chcocnc1" "chccopd3"
## [61] "addepev3" "chckdny2" "havarth4" "diabete4" "diabage4" "marital"
## [67] "educa" "renthom1" "numhhol4" "numphon4" "cpdemo1c" "veteran3"
## [73] "employ1" "children" "income3" "pregnant" "weight2" "height3"
## [79] "deaf" "blind" "decide" "diffwalk" "diffdres" "diffalon"
## [85] "fall12mn" "fallinj5" "smoke100" "smokday2" "usenow3" "ecignow2"
## [91] "alcday4" "avedrnk3" "drnk3ge5" "maxdrnks" "flushot7" "flshtmy3"
## [97] "pneuvac4" "shingle2" "hivtst7" "hivtstd3" "seatbelt" "drnkdri2"
## [103] "covidpo1" "covidsm1" "covidact" "pdiabts1" "prediab2" "diabtype"
## [109] "insulin1" "chkhemo3" "eyeexam1" "diabeye1" "diabedu1" "feetsore"
## [115] "arthexer" "arthedu" "lmtjoin3" "arthdis2" "joinpai2" "lcsfirst"
## [121] "lcslast" "lcsnumcg" "lcsctsc1" "lcsscncr" "lcsctwhn" "hadmam"
## [127] "howlong" "cervscrn" "crvclcnc" "crvclpap" "crvclhpv" "hadhyst2"
## [133] "psatest1" "psatime1" "pcpsars2" "psasugs1" "pcstalk2" "hadsigm4"
## [139] "colnsigm" "colntes1" "sigmtes1" "lastsig4" "colncncr" "vircolo1"
## [145] "vclntes2" "smalstol" "stoltest" "stooldn2" "bldstfit" "sdnates1"
## [151] "cncrdiff" "cncrage" "cncrtyp2" "csrvtrt3" "csrvdoc1" "csrvsum"
## [157] "csrvrtrn" "csrvinst" "csrvinsr" "csrvdein" "csrvclin" "csrvpain"
## [163] "csrvctl2" "indortan" "numburn3" "sunprtct" "wkdayout" "wkendout"
## [169] "cimemlo1" "cdworry" "cddiscu1" "cdhous1" "cdsocia1" "caregiv1"
## [175] "crgvrel4" "crgvlng1" "crgvhrs1" "crgvprb3" "crgvalzd" "crgvper1"
## [181] "crgvhou1" "crgvexpt" "lastsmk2" "stopsmk2" "mentcigs" "mentecig"
## [187] "heattbco" "firearm5" "gunload" "loadulk2" "hasymp1" "hasymp2"
## [193] "hasymp3" "hasymp4" "hasymp5" "hasymp6" "strsymp1" "strsymp2"
## [199] "strsymp3" "strsymp4" "strsymp5" "strsymp6" "firstaid" "aspirin"
## [205] "birthsex" "somale" "sofemale" "trnsgndr" "marijan1" "marjsmok"
## [211] "marjeat" "marjvape" "marjdab" "marjothr" "usemrjn4" "acedeprs"
## [217] "acedrink" "acedrugs" "aceprisn" "acedivrc" "acepunch" "acehurt1"
## [223] "aceswear" "acetouch" "acetthem" "acehvsex" "aceadsaf" "aceadned"
## [229] "imfvpla4" "hpvadvc4" "hpvadsht" "tetanus1" "covidva1" "covacge1"
## [235] "covidnu2" "lsatisfy" "emtsuprt" "sdlonely" "sdhemply" "foodstmp"
## [241] "sdhfood1" "sdhbills" "sdhutils" "sdhtrnsp" "sdhstre1" "rrclass3"
## [247] "rrcognt2" "rrtreat" "rratwrk2" "rrhcare4" "rrphysm2" "rcsgend1"
## [253] "rcsxbrth" "rcsrltn2" "casthdx2" "casthno2" "qstver" "qstlang"
## [259] "metstat" "urbstat" "mscode" "ststr" "strwt" "rawrake"
## [265] "wt2rake" "imprace" "chispnc" "crace1" "cageg" "cllcpwt"
## [271] "dualuse" "dualcor" "llcpwt2" "llcpwt" "rfhlth" "phys14d"
## [277] "ment14d" "hlthpl1" "hcvu653" "totinda" "metvl12" "metvl22"
## [283] "maxvo21" "fc601" "actin13" "actin23" "padur1" "padur2"
## [289] "pafreq1" "pafreq2" "minac12" "minac22" "strfreq" "pamiss3"
## [295] "pamin13" "pamin23" "pa3min" "pavig13" "pavig23" "pa3vigm"
## [301] "pacat3" "paindx3" "pa150r4" "pa300r4" "pa30023" "pastrng"
## [307] "parec3" "pastae3" "rfhype6" "cholch3" "rfchol3" "michd"
## [313] "ltasth1" "casthm1" "asthms1" "drdxar2" "mrace1" "hispanc"
## [319] "race" "raceg21" "racegr3" "raceprv" "sex" "ageg5yr"
## [325] "age65yr" "age80" "age_g" "htin4" "htm4" "wtkg3"
## [331] "bmi5" "bmi5cat" "rfbmi5" "chldcnt" "educag" "incomg1"
## [337] "smoker3" "rfsmok3" "cureci2" "drnkany6" "drocdy4" "rfbing6"
## [343] "drnkwk2" "rfdrhv8" "flshot7" "pneumo3" "aidtst4" "rfseat2"
## [349] "rfseat3" "drnkdrv"
# Report raw dimensions
tibble(
Metric = c("Rows", "Columns"),
Value = c(nrow(brfss_raw), ncol(brfss_raw))
) |>
kable(caption = "Raw BRFSS 2023 Dataset Dimensions") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)
| Metric | Value |
|---|---|
| Rows | 433323 |
| Columns | 350 |
# 2. Select needed variables
brfss_clean <- brfss_raw |>
dplyr::select(menthlth, physhlth, bmi5, sexvar, exerany2, ageg5yr, incomg1, educa, smoker3)
# 3. Recode variables
brfss_clean <- brfss_clean |>
mutate(
menthlth_days = case_when(
menthlth == 88 ~ 0,
menthlth >= 1 & menthlth <= 30 ~ as.numeric(menthlth),
menthlth %in% c(77, 99) ~ NA_real_,
TRUE ~ NA_real_
),
physhlth_days = case_when(
physhlth == 88 ~ 0,
physhlth >= 1 & physhlth <= 30 ~ as.numeric(physhlth),
physhlth %in% c(77, 99) ~ NA_real_,
TRUE ~ NA_real_
),
bmi = case_when(
bmi5 == 9999 ~ NA_real_,
bmi5 > 0 ~ as.numeric(bmi5) / 100,
TRUE ~ NA_real_
),
sex = factor(
case_when(
sexvar == 1 ~ "Male",
sexvar == 2 ~ "Female",
TRUE ~ NA_character_
),
levels = c("Male", "Female")
),
exercise = factor(
case_when(
exerany2 == 1 ~ "Yes",
exerany2 == 2 ~ "No",
exerany2 %in% c(7, 9) ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("No", "Yes")
),
age_group = factor(
case_when(
ageg5yr == 1 ~ "18-24",
ageg5yr == 2 ~ "25-29",
ageg5yr == 3 ~ "30-34",
ageg5yr == 4 ~ "35-39",
ageg5yr == 5 ~ "40-44",
ageg5yr == 6 ~ "45-49",
ageg5yr == 7 ~ "50-54",
ageg5yr == 8 ~ "55-59",
ageg5yr == 9 ~ "60-64",
ageg5yr == 10 ~ "65-69",
ageg5yr == 11 ~ "70-74",
ageg5yr == 12 ~ "75-79",
ageg5yr == 13 ~ "80+",
ageg5yr == 14 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("18-24", "25-29", "30-34", "35-39", "40-44",
"45-49", "50-54", "55-59", "60-64", "65-69",
"70-74", "75-79", "80+")
),
income = factor(
case_when(
incomg1 == 1 ~ "Less than $15,000",
incomg1 == 2 ~ "$15,000 to < $25,000",
incomg1 == 3 ~ "$25,000 to < $35,000",
incomg1 == 4 ~ "$35,000 to < $50,000",
incomg1 == 5 ~ "$50,000 to < $100,000",
incomg1 == 6 ~ "$100,000 to < $200,000",
incomg1 == 7 ~ "$200,000 or more",
incomg1 == 9 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Less than $15,000",
"$15,000 to < $25,000",
"$25,000 to < $35,000",
"$35,000 to < $50,000",
"$50,000 to < $100,000",
"$100,000 to < $200,000",
"$200,000 or more")
),
education = factor(
case_when(
educa %in% c(1, 2) ~ "Less than high school",
educa == 3 ~ "High school diploma or GED",
educa == 4 ~ "Some college or technical school",
educa == 5 ~ "College graduate",
educa == 6 ~ "Graduate or professional degree",
educa == 9 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Less than high school",
"High school diploma or GED",
"Some college or technical school",
"College graduate",
"Graduate or professional degree")
),
smoking = factor(
case_when(
smoker3 == 1 ~ "Current daily smoker",
smoker3 == 2 ~ "Current some-day smoker",
smoker3 == 3 ~ "Former smoker",
smoker3 == 4 ~ "Never smoker",
smoker3 == 9 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Current daily smoker",
"Current some-day smoker",
"Former smoker",
"Never smoker")
)
) |>
dplyr::select(menthlth_days, physhlth_days, bmi, sex, exercise,
age_group, income, education, smoking)
# 4. Missingness before dropping NA
missing_table <- brfss_clean |>
summarise(
menthlth_n_missing = sum(is.na(menthlth_days)),
menthlth_pct_missing = mean(is.na(menthlth_days)) * 100,
bmi_n_missing = sum(is.na(bmi)),
bmi_pct_missing = mean(is.na(bmi)) * 100,
smoking_n_missing = sum(is.na(smoking)),
smoking_pct_missing = mean(is.na(smoking)) * 100
)
missing_table |>
kable(caption = "Missing Values Before Creating the Analytic Sample", digits = 2) |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)
| menthlth_n_missing | menthlth_pct_missing | bmi_n_missing | bmi_pct_missing | smoking_n_missing | smoking_pct_missing |
|---|---|---|---|---|---|
| 8108 | 1.87 | 40535 | 9.35 | 23062 | 5.32 |
# 5. Create analytic dataset
set.seed(1220)
brfss_analytic <- brfss_clean |>
drop_na(menthlth_days, physhlth_days, bmi, sex, exercise,
age_group, income, education, smoking) |>
slice_sample(n = 8000)
saveRDS(brfss_clean,
"C:/Users/tahia/OneDrive/Desktop/UAlbany PhD/Epi 553/HW/HW3/brfss_clean_2023.rds")
# Report final analytic n
tibble(
Metric = c("Observations", "Variables"),
Value = c(nrow(brfss_analytic), ncol(brfss_analytic))
) |>
kable(caption = "Analytic Dataset Dimensions") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)
| Metric | Value |
|---|---|
| Observations | 8000 |
| Variables | 9 |
# 6. Descriptive table stratified by sex
tbl1 <- brfss_analytic |>
tbl_summary(
by = sex,
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
digits = all_continuous() ~ 2
) |>
add_p() |>
bold_labels()
tbl1
| Characteristic | Male N = 3,9361 |
Female N = 4,0641 |
p-value2 |
|---|---|---|---|
| menthlth_days | 3.53 (7.53) | 5.08 (8.60) | <0.001 |
| physhlth_days | 3.98 (8.41) | 4.86 (8.94) | <0.001 |
| bmi | 28.71 (5.97) | 28.69 (6.98) | 0.008 |
| exercise | 3,146 (80%) | 3,094 (76%) | <0.001 |
| age_group | <0.001 | ||
| 18-24 | 235 (6.0%) | 171 (4.2%) | |
| 25-29 | 219 (5.6%) | 189 (4.7%) | |
| 30-34 | 253 (6.4%) | 210 (5.2%) | |
| 35-39 | 263 (6.7%) | 302 (7.4%) | |
| 40-44 | 290 (7.4%) | 292 (7.2%) | |
| 45-49 | 266 (6.8%) | 252 (6.2%) | |
| 50-54 | 305 (7.7%) | 303 (7.5%) | |
| 55-59 | 308 (7.8%) | 352 (8.7%) | |
| 60-64 | 408 (10%) | 379 (9.3%) | |
| 65-69 | 418 (11%) | 483 (12%) | |
| 70-74 | 382 (9.7%) | 426 (10%) | |
| 75-79 | 325 (8.3%) | 338 (8.3%) | |
| 80+ | 264 (6.7%) | 367 (9.0%) | |
| income | <0.001 | ||
| Less than $15,000 | 160 (4.1%) | 247 (6.1%) | |
| $15,000 to < $25,000 | 271 (6.9%) | 370 (9.1%) | |
| $25,000 to < $35,000 | 376 (9.6%) | 495 (12%) | |
| $35,000 to < $50,000 | 482 (12%) | 585 (14%) | |
| $50,000 to < $100,000 | 1,251 (32%) | 1,260 (31%) | |
| $100,000 to < $200,000 | 996 (25%) | 869 (21%) | |
| $200,000 or more | 400 (10%) | 238 (5.9%) | |
| education | 0.003 | ||
| Less than high school | 75 (1.9%) | 49 (1.2%) | |
| High school diploma or GED | 130 (3.3%) | 122 (3.0%) | |
| Some college or technical school | 950 (24%) | 877 (22%) | |
| College graduate | 1,018 (26%) | 1,120 (28%) | |
| Graduate or professional degree | 1,763 (45%) | 1,896 (47%) | |
| smoking | <0.001 | ||
| Current daily smoker | 339 (8.6%) | 319 (7.8%) | |
| Current some-day smoker | 151 (3.8%) | 117 (2.9%) | |
| Former smoker | 1,207 (31%) | 1,055 (26%) | |
| Never smoker | 2,239 (57%) | 2,573 (63%) | |
| 1 Mean (SD); n (%) | |||
| 2 Wilcoxon rank sum test; Pearson’s Chi-squared test | |||
Part 0
First, I loaded the required packages and imported the 2023 BRFSS dataset using read_xpt(). After import, I recorded the total number of rows and columns in the raw dataset. I then selected the nine variables required for the assignment and recoded them according to the assignment instructions. Continuous variables were kept numeric, while categorical variables were converted to labeled factors. Before removing missing observations, I calculated the number and percentage of missing values for menthlth_days, bmi, and smoking. Finally, I created the analytic dataset by removing observations with missing data on any of the nine study variables and drawing a random sample of 8,000 respondents using set.seed(1220). Descriptive statistics were then produced stratified by sex.
# Part 1: Multiple Linear Regression
m_full <- lm(
menthlth_days ~ physhlth_days + bmi + sex + exercise +
age_group + income + education + smoking,
data = brfss_analytic
)
# Step 1: full model summary
summary(m_full)
##
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + exercise +
## age_group + income + education + smoking, data = brfss_analytic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.080 -3.865 -1.597 0.712 30.471
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.65053 0.95407 10.115 < 2e-16
## physhlth_days 0.26558 0.01007 26.384 < 2e-16
## bmi 0.03338 0.01321 2.527 0.011510
## sexFemale 1.39038 0.16750 8.301 < 2e-16
## exerciseYes -0.65116 0.21472 -3.033 0.002432
## age_group25-29 -1.05660 0.51938 -2.034 0.041950
## age_group30-34 -1.09395 0.50646 -2.160 0.030803
## age_group35-39 -1.81103 0.48851 -3.707 0.000211
## age_group40-44 -2.89307 0.48749 -5.935 3.07e-09
## age_group45-49 -3.05618 0.49769 -6.141 8.61e-10
## age_group50-54 -3.51674 0.48323 -7.277 3.72e-13
## age_group55-59 -4.49597 0.47555 -9.454 < 2e-16
## age_group60-64 -4.52215 0.45848 -9.863 < 2e-16
## age_group65-69 -5.57795 0.45019 -12.390 < 2e-16
## age_group70-74 -6.02536 0.45741 -13.173 < 2e-16
## age_group75-79 -6.28656 0.47484 -13.239 < 2e-16
## age_group80+ -6.81968 0.47684 -14.302 < 2e-16
## income$15,000 to < $25,000 -1.63703 0.46899 -3.491 0.000485
## income$25,000 to < $35,000 -2.07637 0.44797 -4.635 3.63e-06
## income$35,000 to < $50,000 -2.54685 0.43819 -5.812 6.40e-09
## income$50,000 to < $100,000 -3.05048 0.41069 -7.428 1.22e-13
## income$100,000 to < $200,000 -3.49984 0.42923 -8.154 4.07e-16
## income$200,000 or more -3.78409 0.50036 -7.563 4.38e-14
## educationHigh school diploma or GED 0.07991 0.81066 0.099 0.921484
## educationSome college or technical school 1.07679 0.68973 1.561 0.118520
## educationCollege graduate 1.79091 0.69119 2.591 0.009585
## educationGraduate or professional degree 1.73781 0.69250 2.509 0.012111
## smokingCurrent some-day smoker -1.58670 0.53523 -2.965 0.003041
## smokingFormer smoker -1.98971 0.33713 -5.902 3.74e-09
## smokingNever smoker -2.93681 0.32162 -9.131 < 2e-16
##
## (Intercept) ***
## physhlth_days ***
## bmi *
## sexFemale ***
## exerciseYes **
## age_group25-29 *
## age_group30-34 *
## age_group35-39 ***
## age_group40-44 ***
## age_group45-49 ***
## age_group50-54 ***
## age_group55-59 ***
## age_group60-64 ***
## age_group65-69 ***
## age_group70-74 ***
## age_group75-79 ***
## age_group80+ ***
## income$15,000 to < $25,000 ***
## income$25,000 to < $35,000 ***
## income$35,000 to < $50,000 ***
## income$50,000 to < $100,000 ***
## income$100,000 to < $200,000 ***
## income$200,000 or more ***
## educationHigh school diploma or GED
## educationSome college or technical school
## educationCollege graduate **
## educationGraduate or professional degree *
## smokingCurrent some-day smoker **
## smokingFormer smoker ***
## smokingNever smoker ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.352 on 7970 degrees of freedom
## Multiple R-squared: 0.1853, Adjusted R-squared: 0.1824
## F-statistic: 62.52 on 29 and 7970 DF, p-value: < 2.2e-16
# Step 2: coefficients for writing the fitted equation
b <- round(coef(m_full), 3)
b
## (Intercept)
## 9.651
## physhlth_days
## 0.266
## bmi
## 0.033
## sexFemale
## 1.390
## exerciseYes
## -0.651
## age_group25-29
## -1.057
## age_group30-34
## -1.094
## age_group35-39
## -1.811
## age_group40-44
## -2.893
## age_group45-49
## -3.056
## age_group50-54
## -3.517
## age_group55-59
## -4.496
## age_group60-64
## -4.522
## age_group65-69
## -5.578
## age_group70-74
## -6.025
## age_group75-79
## -6.287
## age_group80+
## -6.820
## income$15,000 to < $25,000
## -1.637
## income$25,000 to < $35,000
## -2.076
## income$35,000 to < $50,000
## -2.547
## income$50,000 to < $100,000
## -3.050
## income$100,000 to < $200,000
## -3.500
## income$200,000 or more
## -3.784
## educationHigh school diploma or GED
## 0.080
## educationSome college or technical school
## 1.077
## educationCollege graduate
## 1.791
## educationGraduate or professional degree
## 1.738
## smokingCurrent some-day smoker
## -1.587
## smokingFormer smoker
## -1.990
## smokingNever smoker
## -2.937
# Step 4: model-level statistics
model_sum <- summary(m_full)
model_sum$r.squared
## [1] 0.1853363
model_sum$adj.r.squared
## [1] 0.1823721
model_sum$sigma
## [1] 7.351628
f_stat <- model_sum$fstatistic
f_stat
## value numdf dendf
## 62.52339 29.00000 7970.00000
cat("F-statistic:", round(f_stat[1], 3), "\n")
## F-statistic: 62.523
cat("Numerator df:", f_stat[2], "\n")
## Numerator df: 29
cat("Denominator df:", f_stat[3], "\n")
## Denominator df: 7970
cat("p-value:", pf(f_stat[1], f_stat[2], f_stat[3], lower.tail = FALSE), "\n")
## p-value: 0
Part 1 Fitted equation:
menthlth_days^=9.651+0.266(physhlth_days)+0.033(bmi)+1.390(Female)−0.651(Exercise: Yes)
Reference categories are Male, No exercise, age 18–24, income < $15,000, less than high school education, and current daily smoker.
Part 1 Step 3: Interpretation of Coefficients
The results show that both continuous and categorical predictors are independently associated with the number of mentally unhealthy days.All categorical variables are interpreted relative to their reference categories (Male, No exercise, age 18–24, income < 15,000). For each additional physically unhealthy day, mentally unhealthy days increase by 0.266 days, indicating a positive relationship between physical and mental health burden. Similarly, each one-unit increase in BMI is associated with a small increase of 0.033 mentally unhealthy days. Compared to males, females report 1.390 more mentally unhealthy days, holding all other variables constant. Individuals who exercised in the past 30 days report 0.651 fewer mentally unhealthy days than those who did not exercise. For age, individuals aged 40–44 report 2.893 fewer mentally unhealthy days compared to those aged 18–24. Income also shows a strong gradient: individuals earning $50,000–$100,000 report 3.050 fewer mentally unhealthy days, and those earning $200,000 or more report 3.784 fewer days, compared to individuals earning less than $15,000, holding all other variables constant.
Part 1 Step 4: Model-Level Statistics
The model explains a moderate portion of the variation in mentally unhealthy days, with an R-squared value of 0.185, indicating that approximately 18.5% of the variability in the outcome is explained by the included predictors. The adjusted R-squared is slightly lower at 0.182, reflecting adjustment for the number of predictors and providing a more conservative estimate of model fit. The root mean squared error (RMSE) is 7.352, suggesting that the model’s predictions differ from observed values by about 7.35 days on average. The overall F-test evaluates whether all regression coefficients are equal to zero; here, the F-statistic is 62.523 with degrees of freedom (29, 7970) and a p-value less than 0.001. Therefore, the null hypothesis is rejected, indicating that the model is statistically significant and that at least one predictor is associated with mentally unhealthy days.
# Part 2: Tests of Hypotheses
# Step 1: Type III sums of squares
Anova(m_full, type = "III")
## Anova Table (Type III tests)
##
## Response: menthlth_days
## Sum Sq Df F value Pr(>F)
## (Intercept) 5530 1 102.3152 < 2.2e-16 ***
## physhlth_days 37623 1 696.1198 < 2.2e-16 ***
## bmi 345 1 6.3879 0.0115095 *
## sex 3724 1 68.9012 < 2.2e-16 ***
## exercise 497 1 9.1966 0.0024325 **
## age_group 30092 12 46.3986 < 2.2e-16 ***
## income 4734 6 14.5982 < 2.2e-16 ***
## education 1265 4 5.8521 0.0001064 ***
## smoking 5101 3 31.4613 < 2.2e-16 ***
## Residuals 430750 7970
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Step 2: Chunk test for income
m_no_income <- lm(
menthlth_days ~ physhlth_days + bmi + sex + exercise +
age_group + education + smoking,
data = brfss_analytic
)
anova(m_no_income, m_full)
## Analysis of Variance Table
##
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group +
## education + smoking
## Model 2: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group +
## income + education + smoking
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 7976 435484
## 2 7970 430750 6 4733.9 14.598 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# H0: All income coefficients = 0 (no independent effect of income)
# HA: At least one income coefficient ≠ 0
# Step 3: Chunk test for education
m_no_education <- lm(
menthlth_days ~ physhlth_days + bmi + sex + exercise +
age_group + income + smoking,
data = brfss_analytic
)
anova(m_no_education, m_full)
## Analysis of Variance Table
##
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group +
## income + smoking
## Model 2: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group +
## income + education + smoking
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 7974 432015
## 2 7970 430750 4 1265.2 5.8521 0.0001064 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# H0: All education coefficients = 0 (no independent effect of education)
# HA: At least one education coefficient ≠ 0
Part 2 Step 4 Synthesis
The Type III results show that all predictors—including physical unhealthy days, BMI, sex, exercise, age group, income, education, and smoking—have statistically significant independent associations with mentally unhealthy days at α = 0.05. Among these, physical unhealthy days and age group show the strongest contributions, as indicated by their very large F-statistics. The chunk test for income (F = 14.598, df = 6, 7970, p < 0.001) indicates that income as a group significantly improves the model after adjusting for other variables. Similarly, the chunk test for education (F = 5.852, df = 4, 7970, p < 0.001) shows that education also contributes significantly as a group. These chunk tests add important information beyond the individual t-tests by evaluating the combined effect of related categories, confirming that income and education are meaningful predictors even when some individual categories may not be statistically significant on their own.
# Part 3: Interaction Analysis
# Step 1: Create binary smoking variable
brfss_analytic <- brfss_analytic |>
mutate(
smoker_current = factor(
case_when(
smoking %in% c("Current daily smoker", "Current some-day smoker") ~ "Current smoker",
smoking %in% c("Former smoker", "Never smoker") ~ "Non-smoker",
TRUE ~ NA_character_
),
levels = c("Non-smoker", "Current smoker")
)
)
# Step 2: Fit Model A (main effects only)
model_A <- lm(
menthlth_days ~ physhlth_days + bmi + sex + smoker_current +
exercise + age_group + income + education,
data = brfss_analytic
)
summary(model_A)
##
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + smoker_current +
## exercise + age_group + income + education, data = brfss_analytic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.2913 -3.8732 -1.6219 0.6681 30.4937
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.75533 0.92706 7.287 3.48e-13
## physhlth_days 0.26862 0.01007 26.673 < 2e-16
## bmi 0.03341 0.01323 2.525 0.011589
## sexFemale 1.33309 0.16732 7.967 1.84e-15
## smoker_currentCurrent smoker 2.12874 0.27121 7.849 4.74e-15
## exerciseYes -0.67248 0.21503 -3.127 0.001770
## age_group25-29 -0.91492 0.51978 -1.760 0.078412
## age_group30-34 -0.88226 0.50606 -1.743 0.081301
## age_group35-39 -1.58102 0.48765 -3.242 0.001191
## age_group40-44 -2.61572 0.48577 -5.385 7.46e-08
## age_group45-49 -2.82462 0.49698 -5.684 1.37e-08
## age_group50-54 -3.26004 0.48206 -6.763 1.45e-11
## age_group55-59 -4.23010 0.47413 -8.922 < 2e-16
## age_group60-64 -4.24840 0.45682 -9.300 < 2e-16
## age_group65-69 -5.23383 0.44671 -11.716 < 2e-16
## age_group70-74 -5.70233 0.45453 -12.546 < 2e-16
## age_group75-79 -5.89774 0.47031 -12.540 < 2e-16
## age_group80+ -6.48879 0.47372 -13.697 < 2e-16
## income$15,000 to < $25,000 -1.67974 0.46981 -3.575 0.000352
## income$25,000 to < $35,000 -2.10230 0.44863 -4.686 2.83e-06
## income$35,000 to < $50,000 -2.58691 0.43898 -5.893 3.95e-09
## income$50,000 to < $100,000 -3.08227 0.41140 -7.492 7.50e-14
## income$100,000 to < $200,000 -3.53598 0.43000 -8.223 2.30e-16
## income$200,000 or more -3.86251 0.50112 -7.708 1.43e-14
## educationHigh school diploma or GED 0.21386 0.81186 0.263 0.792237
## educationSome college or technical school 1.19653 0.69073 1.732 0.083264
## educationCollege graduate 1.90349 0.69219 2.750 0.005974
## educationGraduate or professional degree 1.74564 0.69382 2.516 0.011890
##
## (Intercept) ***
## physhlth_days ***
## bmi *
## sexFemale ***
## smoker_currentCurrent smoker ***
## exerciseYes **
## age_group25-29 .
## age_group30-34 .
## age_group35-39 **
## age_group40-44 ***
## age_group45-49 ***
## age_group50-54 ***
## age_group55-59 ***
## age_group60-64 ***
## age_group65-69 ***
## age_group70-74 ***
## age_group75-79 ***
## age_group80+ ***
## income$15,000 to < $25,000 ***
## income$25,000 to < $35,000 ***
## income$35,000 to < $50,000 ***
## income$50,000 to < $100,000 ***
## income$100,000 to < $200,000 ***
## income$200,000 or more ***
## educationHigh school diploma or GED
## educationSome college or technical school .
## educationCollege graduate **
## educationGraduate or professional degree *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.366 on 7972 degrees of freedom
## Multiple R-squared: 0.182, Adjusted R-squared: 0.1792
## F-statistic: 65.7 on 27 and 7972 DF, p-value: < 2.2e-16
# Fit Model B (with interaction)
model_B <- lm(
menthlth_days ~ physhlth_days + bmi + sex * smoker_current +
exercise + age_group + income + education,
data = brfss_analytic
)
summary(model_B)
##
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex * smoker_current +
## exercise + age_group + income + education, data = brfss_analytic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.337 -3.837 -1.604 0.628 30.426
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.89938 0.92857 7.430 1.20e-13
## physhlth_days 0.26859 0.01007 26.679 < 2e-16
## bmi 0.03309 0.01323 2.502 0.012380
## sexFemale 1.18553 0.17752 6.678 2.58e-11
## smoker_currentCurrent smoker 1.52079 0.36539 4.162 3.19e-05
## exerciseYes -0.67285 0.21496 -3.130 0.001753
## age_group25-29 -0.92018 0.51962 -1.771 0.076616
## age_group30-34 -0.89242 0.50591 -1.764 0.077774
## age_group35-39 -1.59287 0.48752 -3.267 0.001090
## age_group40-44 -2.62864 0.48564 -5.413 6.39e-08
## age_group45-49 -2.84255 0.49687 -5.721 1.10e-08
## age_group50-54 -3.27784 0.48195 -6.801 1.11e-11
## age_group55-59 -4.24987 0.47404 -8.965 < 2e-16
## age_group60-64 -4.26404 0.45671 -9.336 < 2e-16
## age_group65-69 -5.25062 0.44662 -11.756 < 2e-16
## age_group70-74 -5.71106 0.45439 -12.569 < 2e-16
## age_group75-79 -5.90758 0.47018 -12.565 < 2e-16
## age_group80+ -6.49946 0.47359 -13.724 < 2e-16
## income$15,000 to < $25,000 -1.63574 0.46999 -3.480 0.000503
## income$25,000 to < $35,000 -2.07457 0.44862 -4.624 3.82e-06
## income$35,000 to < $50,000 -2.54551 0.43915 -5.796 7.03e-09
## income$50,000 to < $100,000 -3.04298 0.41157 -7.394 1.57e-13
## income$100,000 to < $200,000 -3.50972 0.42999 -8.162 3.79e-16
## income$200,000 or more -3.84047 0.50103 -7.665 2.00e-14
## educationHigh school diploma or GED 0.12556 0.81238 0.155 0.877177
## educationSome college or technical school 1.11789 0.69123 1.617 0.105865
## educationCollege graduate 1.81788 0.69283 2.624 0.008711
## educationGraduate or professional degree 1.66905 0.69428 2.404 0.016240
## sexFemale:smoker_currentCurrent smoker 1.28327 0.51705 2.482 0.013089
##
## (Intercept) ***
## physhlth_days ***
## bmi *
## sexFemale ***
## smoker_currentCurrent smoker ***
## exerciseYes **
## age_group25-29 .
## age_group30-34 .
## age_group35-39 **
## age_group40-44 ***
## age_group45-49 ***
## age_group50-54 ***
## age_group55-59 ***
## age_group60-64 ***
## age_group65-69 ***
## age_group70-74 ***
## age_group75-79 ***
## age_group80+ ***
## income$15,000 to < $25,000 ***
## income$25,000 to < $35,000 ***
## income$35,000 to < $50,000 ***
## income$50,000 to < $100,000 ***
## income$100,000 to < $200,000 ***
## income$200,000 or more ***
## educationHigh school diploma or GED
## educationSome college or technical school
## educationCollege graduate **
## educationGraduate or professional degree *
## sexFemale:smoker_currentCurrent smoker *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.363 on 7971 degrees of freedom
## Multiple R-squared: 0.1826, Adjusted R-squared: 0.1798
## F-statistic: 63.61 on 28 and 7971 DF, p-value: < 2.2e-16
# Step 3: Compare the two models
anova(model_A, model_B)
## Analysis of Variance Table
##
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + smoker_current +
## exercise + age_group + income + education
## Model 2: menthlth_days ~ physhlth_days + bmi + sex * smoker_current +
## exercise + age_group + income + education
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 7972 432509
## 2 7971 432175 1 333.97 6.1598 0.01309 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# H0: The interaction between sex and current smoking = 0
# HA: The interaction between sex and current smoking ≠ 0
# Step 4: Interpret the interaction using Model B coefficients
coef(model_B)
## (Intercept)
## 6.89937969
## physhlth_days
## 0.26858758
## bmi
## 0.03308926
## sexFemale
## 1.18553147
## smoker_currentCurrent smoker
## 1.52079072
## exerciseYes
## -0.67284825
## age_group25-29
## -0.92018465
## age_group30-34
## -0.89241633
## age_group35-39
## -1.59287470
## age_group40-44
## -2.62863606
## age_group45-49
## -2.84254506
## age_group50-54
## -3.27783622
## age_group55-59
## -4.24986875
## age_group60-64
## -4.26403799
## age_group65-69
## -5.25061692
## age_group70-74
## -5.71105572
## age_group75-79
## -5.90757854
## age_group80+
## -6.49945710
## income$15,000 to < $25,000
## -1.63574119
## income$25,000 to < $35,000
## -2.07456733
## income$35,000 to < $50,000
## -2.54550750
## income$50,000 to < $100,000
## -3.04298017
## income$100,000 to < $200,000
## -3.50972483
## income$200,000 or more
## -3.84047036
## educationHigh school diploma or GED
## 0.12555591
## educationSome college or technical school
## 1.11789017
## educationCollege graduate
## 1.81787943
## educationGraduate or professional degree
## 1.66905037
## sexFemale:smoker_currentCurrent smoker
## 1.28327305
# Effect of current smoking among men
men_effect <- coef(model_B)["smoker_currentCurrent smoker"]
men_effect
## smoker_currentCurrent smoker
## 1.520791
# Effect of current smoking among women
women_effect <- coef(model_B)["smoker_currentCurrent smoker"] +
coef(model_B)["sexFemale:smoker_currentCurrent smoker"]
women_effect
## smoker_currentCurrent smoker
## 2.804064
cat("Estimated effect of current smoking among men:", round(men_effect, 3), "\n")
## Estimated effect of current smoking among men: 1.521
cat("Estimated effect of current smoking among women:", round(women_effect, 3), "\n")
## Estimated effect of current smoking among women: 2.804
cat("Difference in smoking effect between women and men:",
round(coef(model_B)["sexFemale:smoker_currentCurrent smoker"], 3), "\n")
## Difference in smoking effect between women and men: 1.283
# Step 5: Visualize the interaction
plot_data <- ggpredict(model_B, terms = c("smoker_current", "sex"))
ggplot(plot_data, aes(x = x, y = predicted, color = group, group = group)) +
geom_point(size = 3) +
geom_line() +
labs(
x = "Smoking status",
y = "Predicted mentally unhealthy days",
color = "Sex",
title = "Predicted mentally unhealthy days by smoking status and sex"
) +
theme_minimal()
Part 3 Step 3: Interaction Test
The null hypothesis is that there is no interaction between sex and current smoking on mentally unhealthy days (β_interaction = 0), while the alternative hypothesis is that the interaction is not equal to zero. The model comparison shows an F-statistic of 6.160 with 1 and 7971 degrees of freedom and a p-value of 0.013. Since the p-value is less than 0.05, we reject the null hypothesis and conclude that there is a statistically significant interaction between sex and current smoking.
Part 3 Step 4: Interpretation of Interaction
Among men, the estimated effect of current smoking is 1.521, meaning that current smokers report about 1.52 more mentally unhealthy days compared to non-smokers, holding all other variables constant. Among women, the estimated effect is 2.804, meaning that current smokers report about 2.80 more mentally unhealthy days compared to non-smokers, holding other variables constant. Together, these results indicate that the association between smoking and mentally unhealthy days is stronger among women than men, with women experiencing about 1.28 more additional mentally unhealthy days due to smoking compared to men.
Part 3 Step 6: Public Health Interpretation
The results suggest that smoking is associated with worse mental health, and this relationship differs between men and women. In particular, the negative impact of smoking on mental well-being appears to be stronger among women than men. This indicates that women who smoke may experience a greater burden of mental health challenges compared to male smokers. These findings highlight the importance of considering gender differences when designing public health interventions aimed at reducing smoking and improving mental health outcomes.
Part 4 Complete Summary
The results of this analysis suggest that multiple factors are associated with mental health among U.S. adults. Physical unhealthy days showed the strongest association, indicating that individuals with poorer physical health tend to experience more mentally unhealthy days. Age group was also an important predictor, with younger adults reporting more mentally unhealthy days compared to older adults. Smoking and sex were significant as well, and the interaction analysis showed that smoking has a stronger negative effect on mental health among women than men. Income and education were both significant in the chunk tests, suggesting that socioeconomic status plays an important role in mental health. BMI had a statistically significant but relatively small effect, while some individual education categories were not significant, even though the overall education variable was important.
There are several limitations to this analysis. First, the data are cross-sectional, which means we cannot establish causality or determine the direction of the relationships. For example, poor mental health may lead to unhealthy behaviors, rather than the other way around. Second, the data rely on self-reported measures, which may introduce recall bias or reporting bias. There may also be unmeasured confounding variables. For example, access to healthcare and social support are important factors that could influence both mental health and the predictors included in the model. Additionally, factors such as stress levels or chronic conditions could confound the observed associations.
Adjusted R-squared provides a more accurate measure of model fit than regular R-squared because it accounts for the number of predictors in the model. While R-squared always increases when more variables are added, adjusted R-squared only increases if the new variables improve the model meaningfully. This is especially important when including multiple categorical variables, which add many parameters to the model. Chunk tests are also useful because they evaluate the overall contribution of a group of related variables, such as income or education. Even if individual categories are not statistically significant, the group as a whole may still be important, which cannot be fully captured by individual t-tests alone.
AI Transparency
I was using the selected variables from the dataset as it was given in the HW instructions, that means all in capitals. My code was not working from the very beginning, Then I asked the AI about what could be the possible reasons for these.AI suggested that there could be several reasons and told me to run names() so that I can see all the variables present in the dataset and select from there.