library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(MatchIt)
library(cluster)
library(ggplot2)
library(tidyr)
library(purrr)
df <- read_excel("C:\\Users\\Idjelov1\\OneDrive - Health Alliance Plan\\Documents\\propmatch\\rimportfinal.xlsx")
# covariates
covariates <- c("IP_UTIL", "SNF_UTIL", "AGE", "GENDER", "MARA")
df_clean <- df %>%
filter(!is.na(GENDER)) %>%
mutate(GENDER = ifelse(GENDER == "M", 1, #MALE = 1
ifelse(GENDER == "F", 0, NA))) %>% #FEMALE = 0
select(MEMBER, ENROLLED, all_of(covariates)) %>%
mutate(across(all_of(covariates), ~ replace_na(., 0)))
df_clean <- df_clean %>%
mutate(ENROLLED = ifelse(ENROLLED == "Y", 1,
ifelse(ENROLLED == "N", 0, NA))) %>%
mutate(ENROLLED = as.numeric(ENROLLED)) %>%
filter(!is.na(ENROLLED))
# scale covariates for clustering
scaled_data <- scale(df_clean[, c("IP_UTIL", "SNF_UTIL", "AGE", "MARA")])
# k-means clustering
set.seed(111)
kmeans_result <- kmeans(scaled_data, centers = 10)
df_clean$cluster <- kmeans_result$cluster
# function to perform matching within a cluster
match_within_cluster <- function(cluster_df) {
if (length(unique(cluster_df$ENROLLED)) < 2) return(NULL)
match_model <- matchit(ENROLLED ~ AGE + GENDER + SNF_UTIL + IP_UTIL + MARA,
data = cluster_df, method = "nearest")
match.data(match_model)
}
# matching within each cluster
matched_data_list <- df_clean %>%
group_split(cluster) %>%
map(match_within_cluster) %>%
compact
# combine all matched data
matched_data_all <- bind_rows(matched_data_list) %>%
rename(propensity_score = distance)
# ensure covariates are numeric before summarizing
matched_data_all <- matched_data_all %>%
mutate(across(all_of(covariates), as.numeric))
# compare outcomes between matched groups
matched_data_all %>%
group_by(ENROLLED) %>%
summarise(across(all_of(covariates), mean, na.rm = TRUE))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(all_of(covariates), mean, na.rm = TRUE)`.
## ℹ In group 1: `ENROLLED = 0`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
## # A tibble: 2 × 6
## ENROLLED IP_UTIL SNF_UTIL AGE GENDER MARA
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 1.15 1.16 80.5 0.420 3.06
## 2 1 1.16 1.18 80.6 0.424 3.14
# (test) graph for AGE
ggplot(matched_data_all, aes(x = factor(ENROLLED), y = AGE)) +
geom_boxplot(fill = c("#F8766D", "#00BFC4")) +
labs(title = "AGE by PACS Enrollment", x = "PACS Enrolled", y = "AGE") +
theme_minimal()

# cluster distribution bar plot
ggplot(matched_data_all, aes(x = factor(cluster), fill = factor(ENROLLED))) +
geom_bar(position = "dodge") +
labs(title = "Cluster Distribution of Matched Members",
x = "Cluster", fill = "PACS Enrolled") +
theme_minimal()

library(cobalt)
## cobalt (Version 4.6.1, Build Date: 2025-08-20)
##
## Attaching package: 'cobalt'
## The following object is masked from 'package:MatchIt':
##
## lalonde
bal.tab(ENROLLED ~ AGE + GENDER + SNF_UTIL + IP_UTIL + MARA,
data = matched_data_all, method = "matching")
## Note: `s.d.denom` not specified; assuming "pooled".
## Balance Measures
## Type Diff.Un
## AGE Contin. 0.0051
## GENDER Binary 0.0038
## SNF_UTIL Contin. 0.0339
## IP_UTIL Contin. 0.0121
## MARA Contin. 0.0263
##
## Sample sizes
## Control Treated
## All 262 262
#histogram of propensity scores
ggplot(matched_data_all, aes(x = propensity_score, fill = factor(ENROLLED))) +
geom_histogram(position = "identity", alpha = 0.5, bins = 50) +
labs(title = "Propensity Score Distribution by PACS Enrollment",
x = "Propensity Score", fill = "PACS Enrolled") +
theme_minimal()

df_prop <- df_clean
ps_model <- glm(ENROLLED ~ AGE + GENDER + SNF_UTIL + IP_UTIL + MARA,
data = df_clean, family = binomial())
df_prop$propensity_score <- predict(ps_model, type = "response")
ggplot(df_prop, aes(x = propensity_score, fill = factor(ENROLLED))) +
geom_histogram(position = "identity", alpha = 0.5, bins = 50) +
labs(title = "Propensity Score Distribution by PACS Enrollment",
x = "Propensity Score", fill = "PACS Enrolled") +
theme_minimal()

covariates2 <- c("SNF_UTIL", "IP_UTIL", "AGE", "GENDER", "MARA")
for (var in covariates2) {
cat("\nT-test for", var, "\n")
print(t.test(df_clean[[var]] ~ df_clean$ENROLLED, conf.level = 0.95))
}
##
## T-test for SNF_UTIL
##
## Welch Two Sample t-test
##
## data: df_clean[[var]] by df_clean$ENROLLED
## t = -1.7982, df = 512.93, p-value = 0.07274
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.164277511 0.007267006
## sample estimates:
## mean in group 0 mean in group 1
## 1.104701 1.183206
##
##
## T-test for IP_UTIL
##
## Welch Two Sample t-test
##
## data: df_clean[[var]] by df_clean$ENROLLED
## t = -0.64334, df = 533.07, p-value = 0.5203
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.19074603 0.09663107
## sample estimates:
## mean in group 0 mean in group 1
## 1.113248 1.160305
##
##
## T-test for AGE
##
## Welch Two Sample t-test
##
## data: df_clean[[var]] by df_clean$ENROLLED
## t = -1.441, df = 583.34, p-value = 0.1501
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -2.2000291 0.3379556
## sample estimates:
## mean in group 0 mean in group 1
## 79.64530 80.57634
##
##
## T-test for GENDER
##
## Welch Two Sample t-test
##
## data: df_clean[[var]] by df_clean$ENROLLED
## t = -1.1416, df = 532.02, p-value = 0.2541
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.11787242 0.03122793
## sample estimates:
## mean in group 0 mean in group 1
## 0.3803419 0.4236641
##
##
## T-test for MARA
##
## Welch Two Sample t-test
##
## data: df_clean[[var]] by df_clean$ENROLLED
## t = 0.80345, df = 527.49, p-value = 0.4221
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.2890576 0.6891210
## sample estimates:
## mean in group 0 mean in group 1
## 3.342955 3.142924