BRFSS 2011 Imputation of Diabetes Predictors Analysis

# Global chunk options and required packages
knitr::opts_chunk$set(echo=TRUE, message=FALSE, warning=FALSE)

# Core data manipulation and survey-weighted estimation
options(repos = c(CRAN = "https://cloud.r-project.org"))  # set CRAN mirror for non-interactive knit
install.packages(c("tidyverse","survey","dplyr", "mice"), repos = getOption("repos"))  # ensure mirror is used
library(tidyverse)
library(survey)
library(dplyr)
library(mice)

# Handle strata with a single PSU after subsetting (e.g., Texas-only)
options(survey.lonely.psu = "adjust")

# Load BRFSS 2011 data (pre-processed to RDS)
BRFSS2011 <- readRDS("C:/Users/63650/Downloads/BRFSS2011.RDS")
# Restrict sample to Texas respondents (state FIPS = 48)
BRFSS2011_TX <- BRFSS2011 %>% filter(x.state == 48)

# Revised variable list (core predictors + access + diagnostics)
keep_2011 <- c(
  # Survey design (required, not predictors)
  "x.state","x.psu","x.ststr","x.llcpwt",

  # Outcome 
  "diabete3",

  # 1) Demographics
  "age",
  "sex",
  "x.imprace",

  # 2) Biological / cardiometabolic risk
  "x.bmi5",
  "bphigh4",
  "toldhi2",

  # 3) Behavioral risk
  "x.smoker3",
  "exerany2",
  "drnkany5",

  # 4) Socioeconomic position
  "educa",
  "income2",
  "employ",

  # 5) Access to care / detection
  "hlthpln1",
  "checkup1",
  "medcost",

  # 6) Geography / context (Texas-specific)
  "x.region",
  "ctycode1",
  "persdoc2"
)

# Keep only the analysis variables that actually exist in this year's file
df <- BRFSS2011_TX %>%
  select(all_of(intersect(keep_2011, names(BRFSS2011_TX))))

# Recode BRFSS non-substantive response codes to NA
# (DK/Not sure, Refused, and similar special codes)
# Marks entries missing for later handling (e.g., MICE)

# Diabetes source variable: drop DK/refused
df$diabete3[df$diabete3 %in% c(7,9)] <- NA

# Demographics
df$age[df$age %in% c(7, 9)] <- NA
# Restrict to adults
df <- df %>% filter(age >= 18)
# sex by default 1 for male, 2 for female
df$x.imprace[!(df$x.imprace %in% 1:6)] <- NA

# Cardiometabolic risk
df$bphigh4[df$bphigh4 %in% c(2,4,7,9)] <- NA # exclude pregnancy-only, borderline, DK/refused
df$toldhi2[df$toldhi2 %in% c(7,9)] <- NA

# Behavioral risk
df$x.smoker3[df$x.smoker3 == 9] <- NA
df$exerany2[df$exerany2 %in% c(7,9)] <- NA
df$drnkany5[df$drnkany5 %in% c(7,9)] <- NA

# Socioeconomic position
df$educa[df$educa == 9] <- NA
df$income2[df$income2 %in% c(77,99)] <- NA
df$employ[df$employ == 9] <- NA

# Access to care
df$hlthpln1[df$hlthpln1 %in% c(7,9)] <- NA
df$checkup1[df$checkup1 %in% c(7,9)] <- NA # keep 8 = "Never" as a valid category
df$medcost[df$medcost %in% c(7,9)] <- NA
df$persdoc2[df$persdoc2 %in% c(7,9)] <- NA

# Geography
# x.region has no NA
df$ctycode1[df$ctycode1 %in% c(777,999)] <- NA
df$persdoc2[df$persdoc2 %in% c(7,9)] <- NA

# BMI is computed; keep numeric and scale
df$x.bmi5 <- as.numeric(df$x.bmi5)
# Remove impossible values
df$x.bmi5[df$x.bmi5 %in% c(9999, 0)] <- NA
# Plausible analytic range (BMI 12–80)
df$x.bmi5[df$x.bmi5 < 1200 | df$x.bmi5 > 8000] <- NA
# Create scaled BMI in standard units for modeling
df$bmi <- df$x.bmi5 / 100

# Construct binary diagnosed diabetes outcome
# Keep: 1 = Yes, 3 = No
# Exclude: 2 = gestational only, 4 = prediabetes/borderline, plus missing
# Then drop missing outcome (do NOT impute outcome)

df <- df %>%
  mutate(diabetes_dx = case_when(
    diabete3 == 1 ~ 1,
    diabete3 == 3 ~ 0,
    TRUE ~ NA_real_
  )) %>%
  filter(!is.na(diabetes_dx))

# Set variable types for MICE
# Outcome stays as factor (NOT imputed)
# Numeric: age, bmi
# Binary indicators as factors (0/1)
# Ordered: educa, income2, checkup1
# Nominal multi-category: x.imprace, employ, x.smoker3, persdoc2, x.region, ctycode1

df$diabetes_dx <- factor(df$diabetes_dx, levels = c(0,1))

# Continuous
df$age <- as.numeric(df$age)
df$bmi <- as.numeric(df$bmi)

# High blood pressure: 1=Yes, 3=No  -> 1/0
df$bphigh_bin <- NA_integer_
df$bphigh_bin[df$bphigh4 == 1] <- 1
df$bphigh_bin[df$bphigh4 == 3] <- 0

# Told cholesterol high: 1=Yes, 2=No -> 1/0
df$toldhi_bin <- NA_integer_
df$toldhi_bin[df$toldhi2 == 1] <- 1
df$toldhi_bin[df$toldhi2 == 2] <- 0

# Exercise past 30 days: 1=Yes, 2=No -> 1/0
df$exer_bin <- NA_integer_
df$exer_bin[df$exerany2 == 1] <- 1
df$exer_bin[df$exerany2 == 2] <- 0

# Any alcohol past 30 days: 1=Yes, 2=No -> 1/0
df$drink_bin <- NA_integer_
df$drink_bin[df$drnkany5 == 1] <- 1
df$drink_bin[df$drnkany5 == 2] <- 0

# Health insurance coverage: 1=Yes, 2=No -> 1/0
df$hlthpln_bin <- NA_integer_
df$hlthpln_bin[df$hlthpln1 == 1] <- 1
df$hlthpln_bin[df$hlthpln1 == 2] <- 0

# Could not see doctor due to cost: 1=Yes, 2=No -> 1/0
df$medcost_bin <- NA_integer_
df$medcost_bin[df$medcost == 1] <- 1
df$medcost_bin[df$medcost == 2] <- 0

# Personal doctor/provider as binary:(1/2)=1, (3)=0 (any provider vs none)
df$persdoc_any_bin <- NA_integer_
df$persdoc_any_bin[df$persdoc2 %in% c(1, 2)] <- 1
df$persdoc_any_bin[df$persdoc2 == 3] <- 0

# Binary 0/1 predictors: factors so mice uses logistic regression
df$bphigh_bin       <- factor(df$bphigh_bin, levels = c(0, 1))
df$toldhi_bin       <- factor(df$toldhi_bin, levels = c(0, 1))
df$exer_bin         <- factor(df$exer_bin, levels = c(0, 1))
df$drink_bin        <- factor(df$drink_bin, levels = c(0, 1))
df$hlthpln_bin      <- factor(df$hlthpln_bin, levels = c(0, 1))
df$medcost_bin      <- factor(df$medcost_bin, levels = c(0, 1))
df$persdoc_any_bin  <- factor(df$persdoc_any_bin, levels = c(0, 1))

# Ordered / nominal variables (NOT forced to 1/0)
df$sex <- factor(df$sex, levels = c(1, 2))
df$educa   <- ordered(df$educa, levels = 1:6)
df$income2 <- ordered(df$income2, levels = 1:8)
df$employ  <- factor(df$employ, levels = 1:8)
df$x.imprace <- factor(df$x.imprace, levels = 1:6)
df$x.smoker3 <- factor(df$x.smoker3, levels = 1:4)

# Checkup timing (8 = "Never" as valid ordered category)
df$checkup1 <- ordered(df$checkup1, levels = c(1, 2, 3, 4, 8))

# Geography IDs (excluded from MICE, keep as factors for later)
df$x.region <- factor(df$x.region)
df$ctycode1 <- factor(df$ctycode1)

# Run MICE (exclude survey design vars + IDs from imputation)

# Let MICE automatically choose an imputation model (logistic, PMM, etc.) based on variable type
meth <- make.method(df)

# Create the matrix that controls which variables can predict missing values in others
pred <- make.predictorMatrix(df)

# DO NOT EVER IMPUTE THE OUTCOME (and do not use it to predict others)
meth["diabetes_dx"] <- ""
pred[, "diabetes_dx"] <- 0
pred["diabetes_dx", ] <- 0

# Force BMI to use predictive mean matching to avoid implausible imputed values
meth["bmi"]   <- "pmm"
meth["x.bmi5"] <- ""

# Prevent survey design variables and county identifiers from being imputed
meth[c("x.state","x.psu","x.ststr","x.llcpwt","ctycode1","x.region")] <- ""

# Do not impute original multi-code versions when you are using the 0/1 bins in the model
meth[c("bphigh4","toldhi2","exerany2","drnkany5","hlthpln1","medcost","persdoc2")] <- ""

# Impute the 0/1 binary predictors using logistic regression
meth[c("bphigh_bin","toldhi_bin","exer_bin","drink_bin","hlthpln_bin","medcost_bin","persdoc_any_bin")] <- "logreg"

# Exclude survey design variables and county from predicting other variables
# because they are identifiers, not substantive covariates
pred[, c("x.state","x.psu","x.ststr","x.llcpwt","ctycode1","x.region")] <- 0

# Exclude survey design variables and county from being predicted themselves
pred[c("x.state","x.psu","x.ststr","x.llcpwt","ctycode1","x.region"), ] <- 0

# Also remove the original multi-code versions as predictors/targets since we are imputing the bins instead
pred[, c("bphigh4","toldhi2","exerany2","drnkany5","hlthpln1","medcost","persdoc2","x.bmi5")] <- 0
pred[c("bphigh4","toldhi2","exerany2","drnkany5","hlthpln1","medcost","persdoc2","x.bmi5"), ] <- 0

# Run multiple imputation with a small number of datasets and iterations for feasibility
imp <- mice(df, m = 5, maxit = 2, method = meth, predictorMatrix = pred, seed = 123)

# Extract the completed imputed datasets for analysis
d1 <- complete(imp, 1)
d2 <- complete(imp, 2)

# Survey design using the cleaned (non-imputed) dataset
# Used as a baseline / complete-case comparison
des_cc <- svydesign(
  ids = ~x.psu,        # primary sampling units (clusters)
  strata = ~x.ststr,   # stratification variable
  weights = ~x.llcpwt,# BRFSS final survey weights
  data = df,
  nest = TRUE          # PSUs are nested within strata
)

# Survey design for imputed dataset 1
# Preserves BRFSS sampling structure after imputation
des1 <- svydesign(
  ids = ~x.psu,
  strata = ~x.ststr,
  weights = ~x.llcpwt,
  data = d1,
  nest = TRUE
)

# Survey design for imputed dataset 2
# Allows analysis and pooling across multiple imputations
des2 <- svydesign(
  ids = ~x.psu,
  strata = ~x.ststr,
  weights = ~x.llcpwt,
  data = d2,
  nest = TRUE
)

# BASELINE MODEL (no FE, no interactions)

f_base <- diabetes_dx ~
  age + sex + x.imprace +
  bmi + bphigh_bin +
  x.smoker3 + exer_bin +
  educa + income2 +
  hlthpln_bin + checkup1

# Complete case
m1_cc_base <- svyglm(f_base, design = des_cc, family = quasibinomial())

# Imputation 1
m1_d1_base <- svyglm(f_base, design = des1, family = quasibinomial())

# Imputation 2
m1_d2_base <- svyglm(f_base, design = des2, family = quasibinomial())

# Used to establish raw associational relationships
broom::tidy(m1_cc_base)

## # A tibble: 31 × 5
##    term        estimate std.error statistic  p.value
##    <chr>          <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)  -7.56     0.450     -16.8   1.62e-62
##  2 age           0.0377   0.00426     8.83  1.19e-18
##  3 sex2         -0.303    0.112      -2.70  6.87e- 3
##  4 x.imprace2    0.189    0.190       0.996 3.19e- 1
##  5 x.imprace3    1.12     0.472       2.38  1.75e- 2
##  6 x.imprace4    0.903    0.398       2.27  2.34e- 2
##  7 x.imprace5    0.626    0.145       4.32  1.58e- 5
##  8 x.imprace6   -0.641    0.453      -1.42  1.57e- 1
##  9 bmi           0.0831   0.00805    10.3   8.11e-25
## 10 bphigh_bin1   1.14     0.129       8.88  7.75e-19
## # ℹ 21 more rows

broom::tidy(m1_d1_base)

## # A tibble: 31 × 5
##    term        estimate std.error statistic  p.value
##    <chr>          <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)  -7.42     0.386     -19.2   2.88e-81
##  2 age           0.0391   0.00349    11.2   4.75e-29
##  3 sex2         -0.345    0.0977     -3.53  4.20e- 4
##  4 x.imprace2    0.123    0.165       0.745 4.56e- 1
##  5 x.imprace3    0.874    0.402       2.18  2.95e- 2
##  6 x.imprace4    0.545    0.363       1.50  1.33e- 1
##  7 x.imprace5    0.624    0.125       5.01  5.51e- 7
##  8 x.imprace6   -0.640    0.403      -1.59  1.12e- 1
##  9 bmi           0.0825   0.00725    11.4   7.06e-30
## 10 bphigh_bin1   1.16     0.114      10.2   3.17e-24
## # ℹ 21 more rows

broom::tidy(m1_d2_base)

## # A tibble: 31 × 5
##    term        estimate std.error statistic  p.value
##    <chr>          <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)  -7.42     0.384     -19.3   2.53e-82
##  2 age           0.0391   0.00347    11.3   1.91e-29
##  3 sex2         -0.342    0.0973     -3.51  4.49e- 4
##  4 x.imprace2    0.102    0.166       0.613 5.40e- 1
##  5 x.imprace3    0.912    0.391       2.33  1.97e- 2
##  6 x.imprace4    0.577    0.360       1.61  1.08e- 1
##  7 x.imprace5    0.625    0.125       5.00  5.73e- 7
##  8 x.imprace6   -0.689    0.400      -1.72  8.49e- 2
##  9 bmi           0.0823   0.00711    11.6   6.97e-31
## 10 bphigh_bin1   1.15     0.112      10.2   2.23e-24
## # ℹ 21 more rows

# MODEL + REGION FIXED EFFECTS

f_fe <- update(f_base, . ~ . + factor(x.region))

m2_cc_fe <- svyglm(f_fe, design = des_cc, family = quasibinomial())
m2_d1_fe <- svyglm(f_fe, design = des1, family = quasibinomial())
m2_d2_fe <- svyglm(f_fe, design = des2, family = quasibinomial())

# Controls for unobserved regional differences within Texas
broom::tidy(m2_cc_fe)

## # A tibble: 44 × 5
##    term        estimate std.error statistic  p.value
##    <chr>          <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)  -7.64     0.477      -16.0  4.02e-57
##  2 age           0.0378   0.00429      8.82 1.37e-18
##  3 sex2         -0.296    0.112       -2.64 8.29e- 3
##  4 x.imprace2    0.213    0.192        1.11 2.68e- 1
##  5 x.imprace3    1.16     0.500        2.33 2.01e- 2
##  6 x.imprace4    0.874    0.407        2.15 3.18e- 2
##  7 x.imprace5    0.576    0.159        3.63 2.84e- 4
##  8 x.imprace6   -0.615    0.443       -1.39 1.65e- 1
##  9 bmi           0.0823   0.00810     10.2  3.51e-24
## 10 bphigh_bin1   1.16     0.130        8.95 4.24e-19
## # ℹ 34 more rows

broom::tidy(m2_d1_fe)

## # A tibble: 44 × 5
##    term        estimate std.error statistic  p.value
##    <chr>          <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)  -7.46     0.404     -18.5   2.79e-75
##  2 age           0.0391   0.00349    11.2   5.20e-29
##  3 sex2         -0.343    0.0977     -3.51  4.51e- 4
##  4 x.imprace2    0.142    0.166       0.857 3.91e- 1
##  5 x.imprace3    0.913    0.416       2.20  2.82e- 2
##  6 x.imprace4    0.522    0.367       1.42  1.55e- 1
##  7 x.imprace5    0.579    0.135       4.28  1.89e- 5
##  8 x.imprace6   -0.647    0.395      -1.64  1.01e- 1
##  9 bmi           0.0818   0.00736    11.1   1.40e-28
## 10 bphigh_bin1   1.17     0.115      10.1   4.03e-24
## # ℹ 34 more rows

broom::tidy(m2_d2_fe)

## # A tibble: 44 × 5
##    term        estimate std.error statistic  p.value
##    <chr>          <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)  -7.46     0.398     -18.7   1.66e-77
##  2 age           0.0391   0.00347    11.3   2.25e-29
##  3 sex2         -0.340    0.0972     -3.50  4.73e- 4
##  4 x.imprace2    0.122    0.167       0.730 4.65e- 1
##  5 x.imprace3    0.950    0.406       2.34  1.92e- 2
##  6 x.imprace4    0.552    0.364       1.52  1.30e- 1
##  7 x.imprace5    0.584    0.136       4.31  1.67e- 5
##  8 x.imprace6   -0.706    0.393      -1.80  7.22e- 2
##  9 bmi           0.0814   0.00719    11.3   1.29e-29
## 10 bphigh_bin1   1.16     0.113      10.2   2.37e-24
## # ℹ 34 more rows

# MODEL + FIXED EFFECTS + INTERACTIONS

f_fe_int <- update(
  f_fe,
  . ~ . +
    bmi:x.smoker3 +
    bmi:exer_bin +
    income2:hlthpln_bin +
    educa:hlthpln_bin
)

m3_cc_fe_int <- svyglm(f_fe_int, design = des_cc, family = quasibinomial())
m3_d1_fe_int <- svyglm(f_fe_int, design = des1, family = quasibinomial())
m3_d2_fe_int <- svyglm(f_fe_int, design = des2, family = quasibinomial())

# Tests whether key relationships differ across risk-factor combinations
broom::tidy(m3_cc_fe_int)

## # A tibble: 60 × 5
##    term        estimate std.error statistic  p.value
##    <chr>          <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)  -8.26     0.888       -9.30 1.61e-20
##  2 age           0.0376   0.00429      8.75 2.39e-18
##  3 sex2         -0.305    0.112       -2.73 6.32e- 3
##  4 x.imprace2    0.230    0.188        1.22 2.22e- 1
##  5 x.imprace3    1.13     0.506        2.24 2.49e- 2
##  6 x.imprace4    0.876    0.403        2.17 2.98e- 2
##  7 x.imprace5    0.577    0.159        3.63 2.84e- 4
##  8 x.imprace6   -0.565    0.426       -1.32 1.86e- 1
##  9 bmi           0.101    0.0246       4.10 4.08e- 5
## 10 bphigh_bin1   1.16     0.130        8.92 5.52e-19
## # ℹ 50 more rows

broom::tidy(m3_d1_fe_int)

## # A tibble: 60 × 5
##    term        estimate std.error statistic  p.value
##    <chr>          <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)  -8.11     0.798     -10.2   3.49e-24
##  2 age           0.0391   0.00351    11.2   8.48e-29
##  3 sex2         -0.352    0.0972     -3.62  2.91e- 4
##  4 x.imprace2    0.152    0.166       0.918 3.59e- 1
##  5 x.imprace3    0.888    0.418       2.13  3.35e- 2
##  6 x.imprace4    0.529    0.361       1.47  1.42e- 1
##  7 x.imprace5    0.575    0.135       4.26  2.08e- 5
##  8 x.imprace6   -0.619    0.382      -1.62  1.05e- 1
##  9 bmi           0.102    0.0227      4.49  7.07e- 6
## 10 bphigh_bin1   1.16     0.116      10.1   1.06e-23
## # ℹ 50 more rows

broom::tidy(m3_d2_fe_int)

## # A tibble: 60 × 5
##    term        estimate std.error statistic  p.value
##    <chr>          <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)  -8.08     0.771     -10.5   1.28e-25
##  2 age           0.0389   0.00348    11.2   6.93e-29
##  3 sex2         -0.348    0.0971     -3.58  3.39e- 4
##  4 x.imprace2    0.138    0.167       0.825 4.09e- 1
##  5 x.imprace3    0.932    0.406       2.29  2.18e- 2
##  6 x.imprace4    0.563    0.361       1.56  1.19e- 1
##  7 x.imprace5    0.586    0.136       4.32  1.57e- 5
##  8 x.imprace6   -0.687    0.386      -1.78  7.52e- 2
##  9 bmi           0.102    0.0220      4.62  3.80e- 6
## 10 bphigh_bin1   1.16     0.114      10.1   4.60e-24
## # ℹ 50 more rows

# MODEL + FIXED EFFECTS + NONLINEAR TERMS

f_fe_nl <- update(
  f_fe,
  . ~ . +
    I(bmi^2) +
    I(age^2) +
    I(as.numeric(income2)^2)
)

m4_cc_fe_nl <- svyglm(f_fe_nl, design = des_cc, family = quasibinomial())
m4_d1_fe_nl <- svyglm(f_fe_nl, design = des1, family = quasibinomial())
m4_d2_fe_nl <- svyglm(f_fe_nl, design = des2, family = quasibinomial())

# Tests for curvature and threshold effects in continuous predictors
broom::tidy(m4_cc_fe_nl)

## # A tibble: 46 × 5
##    term        estimate std.error statistic  p.value
##    <chr>          <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)  -13.3      1.14     -11.7   2.90e-31
##  2 age            0.173    0.0276     6.25  4.19e-10
##  3 sex2          -0.284    0.112     -2.54  1.10e- 2
##  4 x.imprace2     0.139    0.191      0.727 4.68e- 1
##  5 x.imprace3     1.18     0.507      2.32  2.03e- 2
##  6 x.imprace4     0.872    0.416      2.10  3.60e- 2
##  7 x.imprace5     0.507    0.158      3.22  1.28e- 3
##  8 x.imprace6    -0.556    0.414     -1.34  1.79e- 1
##  9 bmi            0.224    0.0412     5.43  5.64e- 8
## 10 bphigh_bin1    1.05     0.126      8.37  6.73e-17
## # ℹ 36 more rows

broom::tidy(m4_d1_fe_nl)

## # A tibble: 46 × 5
##    term        estimate std.error statistic  p.value
##    <chr>          <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept) -13.6       0.970    -14.0   2.24e-44
##  2 age           0.185     0.0225     8.21  2.32e-16
##  3 sex2         -0.301     0.0977    -3.08  2.09e- 3
##  4 x.imprace2    0.0775    0.166      0.467 6.41e- 1
##  5 x.imprace3    0.961     0.415      2.32  2.05e- 2
##  6 x.imprace4    0.454     0.391      1.16  2.46e- 1
##  7 x.imprace5    0.504     0.136      3.71  2.11e- 4
##  8 x.imprace6   -0.630     0.372     -1.69  9.05e- 2
##  9 bmi           0.230     0.0364     6.33  2.55e-10
## 10 bphigh_bin1   1.05      0.112      9.40  6.31e-21
## # ℹ 36 more rows

broom::tidy(m4_d2_fe_nl)

## # A tibble: 46 × 5
##    term        estimate std.error statistic  p.value
##    <chr>          <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept) -13.2       0.963    -13.7   9.90e-43
##  2 age           0.183     0.0225     8.14  4.26e-16
##  3 sex2         -0.302     0.0970    -3.12  1.83e- 3
##  4 x.imprace2    0.0643    0.167      0.384 7.01e- 1
##  5 x.imprace3    0.972     0.407      2.39  1.69e- 2
##  6 x.imprace4    0.493     0.387      1.28  2.02e- 1
##  7 x.imprace5    0.511     0.136      3.76  1.67e- 4
##  8 x.imprace6   -0.704     0.373     -1.89  5.92e- 2
##  9 bmi           0.211     0.0369     5.73  1.05e- 8
## 10 bphigh_bin1   1.04      0.110      9.43  4.93e-21
## # ℹ 36 more rows

# MODEL + FIXED EFFECTS + INTERACTIONS + NONLINEAR TERMS

f_full <- update(
  f_fe,
  . ~ . +
    bmi:x.smoker3 +
    bmi:exer_bin +
    income2:hlthpln_bin +
    educa:hlthpln_bin +
    I(bmi^2) +
    I(age^2) +
    I(as.numeric(income2)^2)
)

m5_cc_full <- svyglm(f_full, design = des_cc, family = quasibinomial())
m5_d1_full <- svyglm(f_full, design = des1, family = quasibinomial())
m5_d2_full <- svyglm(f_full, design = des2, family = quasibinomial())

# Stress-test specification to assess robustness and overfitting sensitivity
broom::tidy(m5_cc_full)

## # A tibble: 62 × 5
##    term        estimate std.error statistic  p.value
##    <chr>          <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)  -14.3      1.43     -10.0   1.53e-23
##  2 age            0.172    0.0270     6.36  2.07e-10
##  3 sex2          -0.290    0.111     -2.61  9.03e- 3
##  4 x.imprace2     0.150    0.187      0.799 4.24e- 1
##  5 x.imprace3     1.15     0.513      2.25  2.45e- 2
##  6 x.imprace4     0.867    0.413      2.10  3.59e- 2
##  7 x.imprace5     0.510    0.157      3.25  1.16e- 3
##  8 x.imprace6    -0.508    0.407     -1.25  2.12e- 1
##  9 bmi            0.266    0.0504     5.27  1.36e- 7
## 10 bphigh_bin1    1.05     0.126      8.37  6.54e-17
## # ℹ 52 more rows

broom::tidy(m5_d1_full)

## # A tibble: 62 × 5
##    term        estimate std.error statistic  p.value
##    <chr>          <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept) -14.2       1.22     -11.6   3.17e-31
##  2 age           0.187     0.0225     8.35  7.70e-17
##  3 sex2         -0.311     0.0971    -3.20  1.38e- 3
##  4 x.imprace2    0.0771    0.165      0.468 6.40e- 1
##  5 x.imprace3    0.946     0.416      2.27  2.31e- 2
##  6 x.imprace4    0.452     0.388      1.16  2.44e- 1
##  7 x.imprace5    0.504     0.134      3.75  1.77e- 4
##  8 x.imprace6   -0.620     0.372     -1.67  9.50e- 2
##  9 bmi           0.250     0.0436     5.74  9.80e- 9
## 10 bphigh_bin1   1.04      0.111      9.33  1.27e-20
## # ℹ 52 more rows

broom::tidy(m5_d2_full)

## # A tibble: 62 × 5
##    term        estimate std.error statistic  p.value
##    <chr>          <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept) -14.0       1.20     -11.6   6.33e-31
##  2 age           0.185     0.0223     8.29  1.19e-16
##  3 sex2         -0.310     0.0966    -3.20  1.36e- 3
##  4 x.imprace2    0.0740    0.167      0.444 6.57e- 1
##  5 x.imprace3    0.958     0.408      2.35  1.88e- 2
##  6 x.imprace4    0.496     0.386      1.28  2.00e- 1
##  7 x.imprace5    0.514     0.135      3.82  1.37e- 4
##  8 x.imprace6   -0.694     0.375     -1.85  6.46e- 2
##  9 bmi           0.239     0.0448     5.34  9.21e- 8
## 10 bphigh_bin1   1.03      0.110      9.37  8.19e-21
## # ℹ 52 more rows

BRFSS 2011 Imputation of Diabetes Predictors Analysis

Ivan Jimenez- Acosta

2025-12-14