# Global chunk options and required packages
knitr::opts_chunk$set(echo=TRUE, message=FALSE, warning=FALSE)

# Core data manipulation and survey-weighted estimation
options(repos = c(CRAN = "https://cloud.r-project.org"))  # set CRAN mirror for non-interactive knit
install.packages(c("tidyverse","survey","dplyr", "mice"), repos = getOption("repos"))  # ensure mirror is used
library(tidyverse)
library(survey)
library(dplyr)
library(mice)

# Handle strata with a single PSU after subsetting (e.g., Texas-only)
options(survey.lonely.psu = "adjust")
# Load BRFSS 2011 data (pre-processed to RDS)
BRFSS2011 <- readRDS("C:/Users/63650/Downloads/BRFSS2011.RDS")
# Restrict sample to Texas respondents (state FIPS = 48)
BRFSS2011_TX <- BRFSS2011 %>% filter(x.state == 48)
# Revised variable list (core predictors + access + diagnostics)
keep_2011 <- c(
  # Survey design (required, not predictors)
  "x.state","x.psu","x.ststr","x.llcpwt",

  # Outcome 
  "diabete3",

  # 1) Demographics
  "age",
  "sex",
  "x.imprace",

  # 2) Biological / cardiometabolic risk
  "x.bmi5",
  "bphigh4",
  "toldhi2",

  # 3) Behavioral risk
  "x.smoker3",
  "exerany2",
  "drnkany5",

  # 4) Socioeconomic position
  "educa",
  "income2",
  "employ",

  # 5) Access to care / detection
  "hlthpln1",
  "checkup1",
  "medcost",

  # 6) Geography / context (Texas-specific)
  "x.region",
  "ctycode1",
  "persdoc2"
)

# Keep only the analysis variables that actually exist in this year's file
df <- BRFSS2011_TX %>%
  select(all_of(intersect(keep_2011, names(BRFSS2011_TX))))

# Recode BRFSS non-substantive response codes to NA
# (DK/Not sure, Refused, and similar special codes)
# Marks entries missing for later handling (e.g., MICE)

# Diabetes source variable: drop DK/refused
df$diabete3[df$diabete3 %in% c(7,9)] <- NA

# Demographics
df$age[df$age %in% c(7, 9)] <- NA
# Restrict to adults
df <- df %>% filter(age >= 18)
# sex by default 1 for male, 2 for female
df$x.imprace[!(df$x.imprace %in% 1:6)] <- NA

# Cardiometabolic risk
df$bphigh4[df$bphigh4 %in% c(2,4,7,9)] <- NA # exclude pregnancy-only, borderline, DK/refused
df$toldhi2[df$toldhi2 %in% c(7,9)] <- NA

# Behavioral risk
df$x.smoker3[df$x.smoker3 == 9] <- NA
df$exerany2[df$exerany2 %in% c(7,9)] <- NA
df$drnkany5[df$drnkany5 %in% c(7,9)] <- NA

# Socioeconomic position
df$educa[df$educa == 9] <- NA
df$income2[df$income2 %in% c(77,99)] <- NA
df$employ[df$employ == 9] <- NA

# Access to care
df$hlthpln1[df$hlthpln1 %in% c(7,9)] <- NA
df$checkup1[df$checkup1 %in% c(7,9)] <- NA # keep 8 = "Never" as a valid category
df$medcost[df$medcost %in% c(7,9)] <- NA
df$persdoc2[df$persdoc2 %in% c(7,9)] <- NA

# Geography
# x.region has no NA
df$ctycode1[df$ctycode1 %in% c(777,999)] <- NA
df$persdoc2[df$persdoc2 %in% c(7,9)] <- NA

# BMI is computed; keep numeric and scale
df$x.bmi5 <- as.numeric(df$x.bmi5)
# Remove impossible values
df$x.bmi5[df$x.bmi5 %in% c(9999, 0)] <- NA
# Plausible analytic range (BMI 12–80)
df$x.bmi5[df$x.bmi5 < 1200 | df$x.bmi5 > 8000] <- NA
# Create scaled BMI in standard units for modeling
df$bmi <- df$x.bmi5 / 100
# Construct binary diagnosed diabetes outcome
# Keep: 1 = Yes, 3 = No
# Exclude: 2 = gestational only, 4 = prediabetes/borderline, plus missing
# Then drop missing outcome (do NOT impute outcome)

df <- df %>%
  mutate(diabetes_dx = case_when(
    diabete3 == 1 ~ 1,
    diabete3 == 3 ~ 0,
    TRUE ~ NA_real_
  )) %>%
  filter(!is.na(diabetes_dx))

# Set variable types for MICE
# Outcome stays as factor (NOT imputed)
# Numeric: age, bmi
# Binary indicators as factors (0/1)
# Ordered: educa, income2, checkup1
# Nominal multi-category: x.imprace, employ, x.smoker3, persdoc2, x.region, ctycode1

df$diabetes_dx <- factor(df$diabetes_dx, levels = c(0,1))

# Continuous
df$age <- as.numeric(df$age)
df$bmi <- as.numeric(df$bmi)

# High blood pressure: 1=Yes, 3=No  -> 1/0
df$bphigh_bin <- NA_integer_
df$bphigh_bin[df$bphigh4 == 1] <- 1
df$bphigh_bin[df$bphigh4 == 3] <- 0

# Told cholesterol high: 1=Yes, 2=No -> 1/0
df$toldhi_bin <- NA_integer_
df$toldhi_bin[df$toldhi2 == 1] <- 1
df$toldhi_bin[df$toldhi2 == 2] <- 0

# Exercise past 30 days: 1=Yes, 2=No -> 1/0
df$exer_bin <- NA_integer_
df$exer_bin[df$exerany2 == 1] <- 1
df$exer_bin[df$exerany2 == 2] <- 0

# Any alcohol past 30 days: 1=Yes, 2=No -> 1/0
df$drink_bin <- NA_integer_
df$drink_bin[df$drnkany5 == 1] <- 1
df$drink_bin[df$drnkany5 == 2] <- 0

# Health insurance coverage: 1=Yes, 2=No -> 1/0
df$hlthpln_bin <- NA_integer_
df$hlthpln_bin[df$hlthpln1 == 1] <- 1
df$hlthpln_bin[df$hlthpln1 == 2] <- 0

# Could not see doctor due to cost: 1=Yes, 2=No -> 1/0
df$medcost_bin <- NA_integer_
df$medcost_bin[df$medcost == 1] <- 1
df$medcost_bin[df$medcost == 2] <- 0

# Personal doctor/provider as binary:(1/2)=1, (3)=0 (any provider vs none)
df$persdoc_any_bin <- NA_integer_
df$persdoc_any_bin[df$persdoc2 %in% c(1, 2)] <- 1
df$persdoc_any_bin[df$persdoc2 == 3] <- 0

# Binary 0/1 predictors: factors so mice uses logistic regression
df$bphigh_bin       <- factor(df$bphigh_bin, levels = c(0, 1))
df$toldhi_bin       <- factor(df$toldhi_bin, levels = c(0, 1))
df$exer_bin         <- factor(df$exer_bin, levels = c(0, 1))
df$drink_bin        <- factor(df$drink_bin, levels = c(0, 1))
df$hlthpln_bin      <- factor(df$hlthpln_bin, levels = c(0, 1))
df$medcost_bin      <- factor(df$medcost_bin, levels = c(0, 1))
df$persdoc_any_bin  <- factor(df$persdoc_any_bin, levels = c(0, 1))

# Ordered / nominal variables (NOT forced to 1/0)
df$sex <- factor(df$sex, levels = c(1, 2))
df$educa   <- ordered(df$educa, levels = 1:6)
df$income2 <- ordered(df$income2, levels = 1:8)
df$employ  <- factor(df$employ, levels = 1:8)
df$x.imprace <- factor(df$x.imprace, levels = 1:6)
df$x.smoker3 <- factor(df$x.smoker3, levels = 1:4)

# Checkup timing (8 = "Never" as valid ordered category)
df$checkup1 <- ordered(df$checkup1, levels = c(1, 2, 3, 4, 8))

# Geography IDs (excluded from MICE, keep as factors for later)
df$x.region <- factor(df$x.region)
df$ctycode1 <- factor(df$ctycode1)

# Run MICE (exclude survey design vars + IDs from imputation)

# Let MICE automatically choose an imputation model (logistic, PMM, etc.) based on variable type
meth <- make.method(df)

# Create the matrix that controls which variables can predict missing values in others
pred <- make.predictorMatrix(df)

# DO NOT EVER IMPUTE THE OUTCOME (and do not use it to predict others)
meth["diabetes_dx"] <- ""
pred[, "diabetes_dx"] <- 0
pred["diabetes_dx", ] <- 0

# Force BMI to use predictive mean matching to avoid implausible imputed values
meth["bmi"]   <- "pmm"
meth["x.bmi5"] <- ""

# Prevent survey design variables and county identifiers from being imputed
meth[c("x.state","x.psu","x.ststr","x.llcpwt","ctycode1","x.region")] <- ""

# Do not impute original multi-code versions when you are using the 0/1 bins in the model
meth[c("bphigh4","toldhi2","exerany2","drnkany5","hlthpln1","medcost","persdoc2")] <- ""

# Impute the 0/1 binary predictors using logistic regression
meth[c("bphigh_bin","toldhi_bin","exer_bin","drink_bin","hlthpln_bin","medcost_bin","persdoc_any_bin")] <- "logreg"

# Exclude survey design variables and county from predicting other variables
# because they are identifiers, not substantive covariates
pred[, c("x.state","x.psu","x.ststr","x.llcpwt","ctycode1","x.region")] <- 0

# Exclude survey design variables and county from being predicted themselves
pred[c("x.state","x.psu","x.ststr","x.llcpwt","ctycode1","x.region"), ] <- 0

# Also remove the original multi-code versions as predictors/targets since we are imputing the bins instead
pred[, c("bphigh4","toldhi2","exerany2","drnkany5","hlthpln1","medcost","persdoc2","x.bmi5")] <- 0
pred[c("bphigh4","toldhi2","exerany2","drnkany5","hlthpln1","medcost","persdoc2","x.bmi5"), ] <- 0

# Run multiple imputation with a small number of datasets and iterations for feasibility
imp <- mice(df, m = 5, maxit = 2, method = meth, predictorMatrix = pred, seed = 123)

# Extract the completed imputed datasets for analysis
d1 <- complete(imp, 1)
d2 <- complete(imp, 2)
# Survey design using the cleaned (non-imputed) dataset
# Used as a baseline / complete-case comparison
des_cc <- svydesign(
  ids = ~x.psu,        # primary sampling units (clusters)
  strata = ~x.ststr,   # stratification variable
  weights = ~x.llcpwt,# BRFSS final survey weights
  data = df,
  nest = TRUE          # PSUs are nested within strata
)

# Survey design for imputed dataset 1
# Preserves BRFSS sampling structure after imputation
des1 <- svydesign(
  ids = ~x.psu,
  strata = ~x.ststr,
  weights = ~x.llcpwt,
  data = d1,
  nest = TRUE
)

# Survey design for imputed dataset 2
# Allows analysis and pooling across multiple imputations
des2 <- svydesign(
  ids = ~x.psu,
  strata = ~x.ststr,
  weights = ~x.llcpwt,
  data = d2,
  nest = TRUE
)
# BASELINE MODEL (no FE, no interactions)

f_base <- diabetes_dx ~
  age + sex + x.imprace +
  bmi + bphigh_bin +
  x.smoker3 + exer_bin +
  educa + income2 +
  hlthpln_bin + checkup1

# Complete case
m1_cc_base <- svyglm(f_base, design = des_cc, family = quasibinomial())

# Imputation 1
m1_d1_base <- svyglm(f_base, design = des1, family = quasibinomial())

# Imputation 2
m1_d2_base <- svyglm(f_base, design = des2, family = quasibinomial())

# Used to establish raw associational relationships
broom::tidy(m1_cc_base)
broom::tidy(m1_d1_base)
broom::tidy(m1_d2_base)

# MODEL + REGION FIXED EFFECTS

f_fe <- update(f_base, . ~ . + factor(x.region))

m2_cc_fe <- svyglm(f_fe, design = des_cc, family = quasibinomial())
m2_d1_fe <- svyglm(f_fe, design = des1, family = quasibinomial())
m2_d2_fe <- svyglm(f_fe, design = des2, family = quasibinomial())

# Controls for unobserved regional differences within Texas
broom::tidy(m2_cc_fe)
broom::tidy(m2_d1_fe)
broom::tidy(m2_d2_fe)

# MODEL + FIXED EFFECTS + INTERACTIONS

f_fe_int <- update(
  f_fe,
  . ~ . +
    bmi:x.smoker3 +
    bmi:exer_bin +
    income2:hlthpln_bin +
    educa:hlthpln_bin
)

m3_cc_fe_int <- svyglm(f_fe_int, design = des_cc, family = quasibinomial())
m3_d1_fe_int <- svyglm(f_fe_int, design = des1, family = quasibinomial())
m3_d2_fe_int <- svyglm(f_fe_int, design = des2, family = quasibinomial())

# Tests whether key relationships differ across risk-factor combinations
broom::tidy(m3_cc_fe_int)
broom::tidy(m3_d1_fe_int)
broom::tidy(m3_d2_fe_int)

# MODEL + FIXED EFFECTS + NONLINEAR TERMS

f_fe_nl <- update(
  f_fe,
  . ~ . +
    I(bmi^2) +
    I(age^2) +
    I(as.numeric(income2)^2)
)

m4_cc_fe_nl <- svyglm(f_fe_nl, design = des_cc, family = quasibinomial())
m4_d1_fe_nl <- svyglm(f_fe_nl, design = des1, family = quasibinomial())
m4_d2_fe_nl <- svyglm(f_fe_nl, design = des2, family = quasibinomial())

# Tests for curvature and threshold effects in continuous predictors
broom::tidy(m4_cc_fe_nl)
broom::tidy(m4_d1_fe_nl)
broom::tidy(m4_d2_fe_nl)

# MODEL + FIXED EFFECTS + INTERACTIONS + NONLINEAR TERMS

f_full <- update(
  f_fe,
  . ~ . +
    bmi:x.smoker3 +
    bmi:exer_bin +
    income2:hlthpln_bin +
    educa:hlthpln_bin +
    I(bmi^2) +
    I(age^2) +
    I(as.numeric(income2)^2)
)

m5_cc_full <- svyglm(f_full, design = des_cc, family = quasibinomial())
m5_d1_full <- svyglm(f_full, design = des1, family = quasibinomial())
m5_d2_full <- svyglm(f_full, design = des2, family = quasibinomial())

# Stress-test specification to assess robustness and overfitting sensitivity
broom::tidy(m5_cc_full)
broom::tidy(m5_d1_full)
broom::tidy(m5_d2_full)