# Global chunk options and required packages
knitr::opts_chunk$set(echo=TRUE, message=FALSE, warning=FALSE)
# Core data manipulation and survey-weighted estimation
options(repos = c(CRAN = "https://cloud.r-project.org")) # set CRAN mirror for non-interactive knit
install.packages(c("tidyverse","survey","dplyr", "mice"), repos = getOption("repos")) # ensure mirror is used
library(tidyverse)
library(survey)
library(dplyr)
library(mice)
# Handle strata with a single PSU after subsetting (e.g., Texas-only)
options(survey.lonely.psu = "adjust")
# Load BRFSS 2011 data (pre-processed to RDS)
BRFSS2011 <- readRDS("C:/Users/63650/Downloads/BRFSS2011.RDS")
# Restrict sample to Texas respondents (state FIPS = 48)
BRFSS2011_TX <- BRFSS2011 %>% filter(x.state == 48)
# Revised variable list (core predictors + access + diagnostics)
keep_2011 <- c(
# Survey design (required, not predictors)
"x.state","x.psu","x.ststr","x.llcpwt",
# Outcome
"diabete3",
# 1) Demographics
"age",
"sex",
"x.imprace",
# 2) Biological / cardiometabolic risk
"x.bmi5",
"bphigh4",
"toldhi2",
# 3) Behavioral risk
"x.smoker3",
"exerany2",
"drnkany5",
# 4) Socioeconomic position
"educa",
"income2",
"employ",
# 5) Access to care / detection
"hlthpln1",
"checkup1",
"medcost",
# 6) Geography / context (Texas-specific)
"x.region",
"ctycode1",
"persdoc2"
)
# Keep only the analysis variables that actually exist in this year's file
df <- BRFSS2011_TX %>%
select(all_of(intersect(keep_2011, names(BRFSS2011_TX))))
# Recode BRFSS non-substantive response codes to NA
# (DK/Not sure, Refused, and similar special codes)
# Marks entries missing for later handling (e.g., MICE)
# Diabetes source variable: drop DK/refused
df$diabete3[df$diabete3 %in% c(7,9)] <- NA
# Demographics
df$age[df$age %in% c(7, 9)] <- NA
# Restrict to adults
df <- df %>% filter(age >= 18)
# sex by default 1 for male, 2 for female
df$x.imprace[!(df$x.imprace %in% 1:6)] <- NA
# Cardiometabolic risk
df$bphigh4[df$bphigh4 %in% c(2,4,7,9)] <- NA # exclude pregnancy-only, borderline, DK/refused
df$toldhi2[df$toldhi2 %in% c(7,9)] <- NA
# Behavioral risk
df$x.smoker3[df$x.smoker3 == 9] <- NA
df$exerany2[df$exerany2 %in% c(7,9)] <- NA
df$drnkany5[df$drnkany5 %in% c(7,9)] <- NA
# Socioeconomic position
df$educa[df$educa == 9] <- NA
df$income2[df$income2 %in% c(77,99)] <- NA
df$employ[df$employ == 9] <- NA
# Access to care
df$hlthpln1[df$hlthpln1 %in% c(7,9)] <- NA
df$checkup1[df$checkup1 %in% c(7,9)] <- NA # keep 8 = "Never" as a valid category
df$medcost[df$medcost %in% c(7,9)] <- NA
df$persdoc2[df$persdoc2 %in% c(7,9)] <- NA
# Geography
# x.region has no NA
df$ctycode1[df$ctycode1 %in% c(777,999)] <- NA
df$persdoc2[df$persdoc2 %in% c(7,9)] <- NA
# BMI is computed; keep numeric and scale
df$x.bmi5 <- as.numeric(df$x.bmi5)
# Remove impossible values
df$x.bmi5[df$x.bmi5 %in% c(9999, 0)] <- NA
# Plausible analytic range (BMI 12–80)
df$x.bmi5[df$x.bmi5 < 1200 | df$x.bmi5 > 8000] <- NA
# Create scaled BMI in standard units for modeling
df$bmi <- df$x.bmi5 / 100
# Construct binary diagnosed diabetes outcome
# Keep: 1 = Yes, 3 = No
# Exclude: 2 = gestational only, 4 = prediabetes/borderline, plus missing
# Then drop missing outcome (do NOT impute outcome)
df <- df %>%
mutate(diabetes_dx = case_when(
diabete3 == 1 ~ 1,
diabete3 == 3 ~ 0,
TRUE ~ NA_real_
)) %>%
filter(!is.na(diabetes_dx))
# Set variable types for MICE
# Outcome stays as factor (NOT imputed)
# Numeric: age, bmi
# Binary indicators as factors (0/1)
# Ordered: educa, income2, checkup1
# Nominal multi-category: x.imprace, employ, x.smoker3, persdoc2, x.region, ctycode1
df$diabetes_dx <- factor(df$diabetes_dx, levels = c(0,1))
# Continuous
df$age <- as.numeric(df$age)
df$bmi <- as.numeric(df$bmi)
# High blood pressure: 1=Yes, 3=No -> 1/0
df$bphigh_bin <- NA_integer_
df$bphigh_bin[df$bphigh4 == 1] <- 1
df$bphigh_bin[df$bphigh4 == 3] <- 0
# Told cholesterol high: 1=Yes, 2=No -> 1/0
df$toldhi_bin <- NA_integer_
df$toldhi_bin[df$toldhi2 == 1] <- 1
df$toldhi_bin[df$toldhi2 == 2] <- 0
# Exercise past 30 days: 1=Yes, 2=No -> 1/0
df$exer_bin <- NA_integer_
df$exer_bin[df$exerany2 == 1] <- 1
df$exer_bin[df$exerany2 == 2] <- 0
# Any alcohol past 30 days: 1=Yes, 2=No -> 1/0
df$drink_bin <- NA_integer_
df$drink_bin[df$drnkany5 == 1] <- 1
df$drink_bin[df$drnkany5 == 2] <- 0
# Health insurance coverage: 1=Yes, 2=No -> 1/0
df$hlthpln_bin <- NA_integer_
df$hlthpln_bin[df$hlthpln1 == 1] <- 1
df$hlthpln_bin[df$hlthpln1 == 2] <- 0
# Could not see doctor due to cost: 1=Yes, 2=No -> 1/0
df$medcost_bin <- NA_integer_
df$medcost_bin[df$medcost == 1] <- 1
df$medcost_bin[df$medcost == 2] <- 0
# Personal doctor/provider as binary:(1/2)=1, (3)=0 (any provider vs none)
df$persdoc_any_bin <- NA_integer_
df$persdoc_any_bin[df$persdoc2 %in% c(1, 2)] <- 1
df$persdoc_any_bin[df$persdoc2 == 3] <- 0
# Binary 0/1 predictors: factors so mice uses logistic regression
df$bphigh_bin <- factor(df$bphigh_bin, levels = c(0, 1))
df$toldhi_bin <- factor(df$toldhi_bin, levels = c(0, 1))
df$exer_bin <- factor(df$exer_bin, levels = c(0, 1))
df$drink_bin <- factor(df$drink_bin, levels = c(0, 1))
df$hlthpln_bin <- factor(df$hlthpln_bin, levels = c(0, 1))
df$medcost_bin <- factor(df$medcost_bin, levels = c(0, 1))
df$persdoc_any_bin <- factor(df$persdoc_any_bin, levels = c(0, 1))
# Ordered / nominal variables (NOT forced to 1/0)
df$sex <- factor(df$sex, levels = c(1, 2))
df$educa <- ordered(df$educa, levels = 1:6)
df$income2 <- ordered(df$income2, levels = 1:8)
df$employ <- factor(df$employ, levels = 1:8)
df$x.imprace <- factor(df$x.imprace, levels = 1:6)
df$x.smoker3 <- factor(df$x.smoker3, levels = 1:4)
# Checkup timing (8 = "Never" as valid ordered category)
df$checkup1 <- ordered(df$checkup1, levels = c(1, 2, 3, 4, 8))
# Geography IDs (excluded from MICE, keep as factors for later)
df$x.region <- factor(df$x.region)
df$ctycode1 <- factor(df$ctycode1)
# Run MICE (exclude survey design vars + IDs from imputation)
# Let MICE automatically choose an imputation model (logistic, PMM, etc.) based on variable type
meth <- make.method(df)
# Create the matrix that controls which variables can predict missing values in others
pred <- make.predictorMatrix(df)
# DO NOT EVER IMPUTE THE OUTCOME (and do not use it to predict others)
meth["diabetes_dx"] <- ""
pred[, "diabetes_dx"] <- 0
pred["diabetes_dx", ] <- 0
# Force BMI to use predictive mean matching to avoid implausible imputed values
meth["bmi"] <- "pmm"
meth["x.bmi5"] <- ""
# Prevent survey design variables and county identifiers from being imputed
meth[c("x.state","x.psu","x.ststr","x.llcpwt","ctycode1","x.region")] <- ""
# Do not impute original multi-code versions when you are using the 0/1 bins in the model
meth[c("bphigh4","toldhi2","exerany2","drnkany5","hlthpln1","medcost","persdoc2")] <- ""
# Impute the 0/1 binary predictors using logistic regression
meth[c("bphigh_bin","toldhi_bin","exer_bin","drink_bin","hlthpln_bin","medcost_bin","persdoc_any_bin")] <- "logreg"
# Exclude survey design variables and county from predicting other variables
# because they are identifiers, not substantive covariates
pred[, c("x.state","x.psu","x.ststr","x.llcpwt","ctycode1","x.region")] <- 0
# Exclude survey design variables and county from being predicted themselves
pred[c("x.state","x.psu","x.ststr","x.llcpwt","ctycode1","x.region"), ] <- 0
# Also remove the original multi-code versions as predictors/targets since we are imputing the bins instead
pred[, c("bphigh4","toldhi2","exerany2","drnkany5","hlthpln1","medcost","persdoc2","x.bmi5")] <- 0
pred[c("bphigh4","toldhi2","exerany2","drnkany5","hlthpln1","medcost","persdoc2","x.bmi5"), ] <- 0
# Run multiple imputation with a small number of datasets and iterations for feasibility
imp <- mice(df, m = 5, maxit = 2, method = meth, predictorMatrix = pred, seed = 123)
# Extract the completed imputed datasets for analysis
d1 <- complete(imp, 1)
d2 <- complete(imp, 2)
# Survey design using the cleaned (non-imputed) dataset
# Used as a baseline / complete-case comparison
des_cc <- svydesign(
ids = ~x.psu, # primary sampling units (clusters)
strata = ~x.ststr, # stratification variable
weights = ~x.llcpwt,# BRFSS final survey weights
data = df,
nest = TRUE # PSUs are nested within strata
)
# Survey design for imputed dataset 1
# Preserves BRFSS sampling structure after imputation
des1 <- svydesign(
ids = ~x.psu,
strata = ~x.ststr,
weights = ~x.llcpwt,
data = d1,
nest = TRUE
)
# Survey design for imputed dataset 2
# Allows analysis and pooling across multiple imputations
des2 <- svydesign(
ids = ~x.psu,
strata = ~x.ststr,
weights = ~x.llcpwt,
data = d2,
nest = TRUE
)
# BASELINE MODEL (no FE, no interactions)
f_base <- diabetes_dx ~
age + sex + x.imprace +
bmi + bphigh_bin +
x.smoker3 + exer_bin +
educa + income2 +
hlthpln_bin + checkup1
# Complete case
m1_cc_base <- svyglm(f_base, design = des_cc, family = quasibinomial())
# Imputation 1
m1_d1_base <- svyglm(f_base, design = des1, family = quasibinomial())
# Imputation 2
m1_d2_base <- svyglm(f_base, design = des2, family = quasibinomial())
# Used to establish raw associational relationships
broom::tidy(m1_cc_base)
broom::tidy(m1_d1_base)
broom::tidy(m1_d2_base)
# MODEL + REGION FIXED EFFECTS
f_fe <- update(f_base, . ~ . + factor(x.region))
m2_cc_fe <- svyglm(f_fe, design = des_cc, family = quasibinomial())
m2_d1_fe <- svyglm(f_fe, design = des1, family = quasibinomial())
m2_d2_fe <- svyglm(f_fe, design = des2, family = quasibinomial())
# Controls for unobserved regional differences within Texas
broom::tidy(m2_cc_fe)
broom::tidy(m2_d1_fe)
broom::tidy(m2_d2_fe)
# MODEL + FIXED EFFECTS + INTERACTIONS
f_fe_int <- update(
f_fe,
. ~ . +
bmi:x.smoker3 +
bmi:exer_bin +
income2:hlthpln_bin +
educa:hlthpln_bin
)
m3_cc_fe_int <- svyglm(f_fe_int, design = des_cc, family = quasibinomial())
m3_d1_fe_int <- svyglm(f_fe_int, design = des1, family = quasibinomial())
m3_d2_fe_int <- svyglm(f_fe_int, design = des2, family = quasibinomial())
# Tests whether key relationships differ across risk-factor combinations
broom::tidy(m3_cc_fe_int)
broom::tidy(m3_d1_fe_int)
broom::tidy(m3_d2_fe_int)
# MODEL + FIXED EFFECTS + NONLINEAR TERMS
f_fe_nl <- update(
f_fe,
. ~ . +
I(bmi^2) +
I(age^2) +
I(as.numeric(income2)^2)
)
m4_cc_fe_nl <- svyglm(f_fe_nl, design = des_cc, family = quasibinomial())
m4_d1_fe_nl <- svyglm(f_fe_nl, design = des1, family = quasibinomial())
m4_d2_fe_nl <- svyglm(f_fe_nl, design = des2, family = quasibinomial())
# Tests for curvature and threshold effects in continuous predictors
broom::tidy(m4_cc_fe_nl)
broom::tidy(m4_d1_fe_nl)
broom::tidy(m4_d2_fe_nl)
# MODEL + FIXED EFFECTS + INTERACTIONS + NONLINEAR TERMS
f_full <- update(
f_fe,
. ~ . +
bmi:x.smoker3 +
bmi:exer_bin +
income2:hlthpln_bin +
educa:hlthpln_bin +
I(bmi^2) +
I(age^2) +
I(as.numeric(income2)^2)
)
m5_cc_full <- svyglm(f_full, design = des_cc, family = quasibinomial())
m5_d1_full <- svyglm(f_full, design = des1, family = quasibinomial())
m5_d2_full <- svyglm(f_full, design = des2, family = quasibinomial())
# Stress-test specification to assess robustness and overfitting sensitivity
broom::tidy(m5_cc_full)
broom::tidy(m5_d1_full)
broom::tidy(m5_d2_full)