library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(MatchIt)
library(cluster)
library(ggplot2)
library(tidyr)
library(purrr)


df <- read_excel("C:\\Users\\Idjelov1\\OneDrive - Health Alliance Plan\\Documents\\propmatch\\rimportfinal.xlsx")

# covariates
covariates <- c("IP_UTIL", "SNF_UTIL", "AGE", "GENDER", "MARA")


df_clean <- df %>%
   filter(!is.na(GENDER)) %>%
  mutate(GENDER = ifelse(GENDER == "M", 1, #MALE = 1 
                         ifelse(GENDER == "F", 0, NA))) %>% #FEMALE = 0
  select(MEMBER, ENROLLED, all_of(covariates)) %>%
  mutate(across(all_of(covariates), ~ replace_na(., 0)))

df_clean <- df_clean %>%
  mutate(ENROLLED = ifelse(ENROLLED == "Y", 1,
                           ifelse(ENROLLED == "N", 0, NA))) %>%
  mutate(ENROLLED = as.numeric(ENROLLED)) %>%
  filter(!is.na(ENROLLED))

# scale covariates for clustering
scaled_data <- scale(df_clean[, c("IP_UTIL", "SNF_UTIL", "AGE", "MARA")])

# k-means clustering
set.seed(111)
kmeans_result <- kmeans(scaled_data, centers = 10)
df_clean$cluster <- kmeans_result$cluster

# function to perform matching within a cluster
match_within_cluster <- function(cluster_df) {
  if (length(unique(cluster_df$ENROLLED)) < 2) return(NULL)
  
  match_model <- matchit(ENROLLED ~ AGE + GENDER + SNF_UTIL + IP_UTIL + MARA,
                         data = cluster_df, method = "nearest")

  
  match.data(match_model)
}

# matching within each cluster
matched_data_list <- df_clean %>%
  group_split(cluster) %>%
  map(match_within_cluster) %>%
  compact

# combine all matched data
matched_data_all <- bind_rows(matched_data_list) %>%
  rename(propensity_score = distance)

# ensure covariates are numeric before summarizing
matched_data_all <- matched_data_all %>%
  mutate(across(all_of(covariates), as.numeric))

# compare outcomes between matched groups
matched_data_all %>%
  group_by(ENROLLED) %>%
  summarise(across(all_of(covariates), mean, na.rm = TRUE))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(all_of(covariates), mean, na.rm = TRUE)`.
## ℹ In group 1: `ENROLLED = 0`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))
## # A tibble: 2 × 6
##   ENROLLED IP_UTIL SNF_UTIL   AGE GENDER  MARA
##      <dbl>   <dbl>    <dbl> <dbl>  <dbl> <dbl>
## 1        0    1.15     1.16  80.5  0.420  3.06
## 2        1    1.16     1.18  80.6  0.424  3.14
# (test) graph for AGE
ggplot(matched_data_all, aes(x = factor(ENROLLED), y = AGE)) +
  geom_boxplot(fill = c("#F8766D", "#00BFC4")) +
  labs(title = "AGE by PACS Enrollment", x = "PACS Enrolled", y = "AGE") +
  theme_minimal()

# cluster distribution bar plot
ggplot(matched_data_all, aes(x = factor(cluster), fill = factor(ENROLLED))) +
  geom_bar(position = "dodge") +
  labs(title = "Cluster Distribution of Matched Members",
       x = "Cluster", fill = "PACS Enrolled") +
  theme_minimal()

library(cobalt)
##  cobalt (Version 4.6.1, Build Date: 2025-08-20)
## 
## Attaching package: 'cobalt'
## The following object is masked from 'package:MatchIt':
## 
##     lalonde
bal.tab(ENROLLED ~ AGE + GENDER + SNF_UTIL + IP_UTIL + MARA,
        data = matched_data_all, method = "matching")
## Note: `s.d.denom` not specified; assuming "pooled".
## Balance Measures
##             Type Diff.Un
## AGE      Contin.  0.0051
## GENDER    Binary  0.0038
## SNF_UTIL Contin.  0.0339
## IP_UTIL  Contin.  0.0121
## MARA     Contin.  0.0263
## 
## Sample sizes
##     Control Treated
## All     262     262
#histogram of propensity scores
ggplot(matched_data_all, aes(x = propensity_score, fill = factor(ENROLLED))) +
  geom_histogram(position = "identity", alpha = 0.5, bins = 50) +
  labs(title = "Propensity Score Distribution by PACS Enrollment",
       x = "Propensity Score", fill = "PACS Enrolled") +
  theme_minimal()

df_prop <- df_clean

ps_model <- glm(ENROLLED ~ AGE + GENDER + SNF_UTIL + IP_UTIL + MARA,
                data = df_clean, family = binomial())

df_prop$propensity_score <- predict(ps_model, type = "response")

ggplot(df_prop, aes(x = propensity_score, fill = factor(ENROLLED))) +
  geom_histogram(position = "identity", alpha = 0.5, bins = 50) +
  labs(title = "Propensity Score Distribution by PACS Enrollment",
       x = "Propensity Score", fill = "PACS Enrolled") +
  theme_minimal()

covariates2 <- c("SNF_UTIL", "IP_UTIL", "AGE", "GENDER", "MARA")

for (var in covariates2) {
  cat("\nT-test for", var, "\n")
  print(t.test(df_clean[[var]] ~ df_clean$ENROLLED, conf.level = 0.95))
}
## 
## T-test for SNF_UTIL 
## 
##  Welch Two Sample t-test
## 
## data:  df_clean[[var]] by df_clean$ENROLLED
## t = -1.7982, df = 512.93, p-value = 0.07274
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -0.164277511  0.007267006
## sample estimates:
## mean in group 0 mean in group 1 
##        1.104701        1.183206 
## 
## 
## T-test for IP_UTIL 
## 
##  Welch Two Sample t-test
## 
## data:  df_clean[[var]] by df_clean$ENROLLED
## t = -0.64334, df = 533.07, p-value = 0.5203
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -0.19074603  0.09663107
## sample estimates:
## mean in group 0 mean in group 1 
##        1.113248        1.160305 
## 
## 
## T-test for AGE 
## 
##  Welch Two Sample t-test
## 
## data:  df_clean[[var]] by df_clean$ENROLLED
## t = -1.441, df = 583.34, p-value = 0.1501
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -2.2000291  0.3379556
## sample estimates:
## mean in group 0 mean in group 1 
##        79.64530        80.57634 
## 
## 
## T-test for GENDER 
## 
##  Welch Two Sample t-test
## 
## data:  df_clean[[var]] by df_clean$ENROLLED
## t = -1.1416, df = 532.02, p-value = 0.2541
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -0.11787242  0.03122793
## sample estimates:
## mean in group 0 mean in group 1 
##       0.3803419       0.4236641 
## 
## 
## T-test for MARA 
## 
##  Welch Two Sample t-test
## 
## data:  df_clean[[var]] by df_clean$ENROLLED
## t = 0.80345, df = 527.49, p-value = 0.4221
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -0.2890576  0.6891210
## sample estimates:
## mean in group 0 mean in group 1 
##        3.342955        3.142924