The Whole R Script Running

############################################################
# 1. Load Data
############################################################

excel_file <- "diabetis_data.xlsx"

if (!file.exists(excel_file)) {
  excel_file <- list.files(pattern = "diabet.*\\.xlsx$|diabetes.*\\.xlsx$", ignore.case = TRUE)[1]
}

diabetic_raw <- read_excel(excel_file)

head(diabetic_raw)
## # A tibble: 6 × 50
##   encounter_id patient_nbr race            gender age   weight admission_type_id
##          <dbl>       <dbl> <chr>           <chr>  <chr> <chr>              <dbl>
## 1      2278392     8222157 Caucasian       Female [0-1… ?                      6
## 2       149190    55629189 Caucasian       Female [10-… ?                      1
## 3        64410    86047875 AfricanAmerican Female [20-… ?                      1
## 4       500364    82442376 Caucasian       Male   [30-… ?                      1
## 5        16680    42519267 Caucasian       Male   [40-… ?                      1
## 6        35754    82637451 Caucasian       Male   [50-… ?                      2
## # ℹ 43 more variables: discharge_disposition_id <dbl>,
## #   admission_source_id <dbl>, time_in_hospital <dbl>, payer_code <chr>,
## #   medical_specialty <chr>, num_lab_procedures <dbl>, num_procedures <dbl>,
## #   num_medications <dbl>, number_outpatient <dbl>, number_emergency <dbl>,
## #   number_inpatient <dbl>, diag_1 <chr>, diag_2 <chr>, diag_3 <chr>,
## #   number_diagnoses <dbl>, max_glu_serum <chr>, A1Cresult <chr>,
## #   metformin <chr>, repaglinide <chr>, nateglinide <chr>, …
dim(diabetic_raw)
## [1] 101766     50
names(diabetic_raw)
##  [1] "encounter_id"             "patient_nbr"             
##  [3] "race"                     "gender"                  
##  [5] "age"                      "weight"                  
##  [7] "admission_type_id"        "discharge_disposition_id"
##  [9] "admission_source_id"      "time_in_hospital"        
## [11] "payer_code"               "medical_specialty"       
## [13] "num_lab_procedures"       "num_procedures"          
## [15] "num_medications"          "number_outpatient"       
## [17] "number_emergency"         "number_inpatient"        
## [19] "diag_1"                   "diag_2"                  
## [21] "diag_3"                   "number_diagnoses"        
## [23] "max_glu_serum"            "A1Cresult"               
## [25] "metformin"                "repaglinide"             
## [27] "nateglinide"              "chlorpropamide"          
## [29] "glimepiride"              "acetohexamide"           
## [31] "glipizide"                "glyburide"               
## [33] "tolbutamide"              "pioglitazone"            
## [35] "rosiglitazone"            "acarbose"                
## [37] "miglitol"                 "troglitazone"            
## [39] "tolazamide"               "examide"                 
## [41] "citoglipton"              "insulin"                 
## [43] "glyburide-metformin"      "glipizide-metformin"     
## [45] "glimepiride-pioglitazone" "metformin-rosiglitazone" 
## [47] "metformin-pioglitazone"   "change"                  
## [49] "diabetesMed"              "readmitted"
str(diabetic_raw)
## tibble [101,766 × 50] (S3: tbl_df/tbl/data.frame)
##  $ encounter_id            : num [1:101766] 2278392 149190 64410 500364 16680 ...
##  $ patient_nbr             : num [1:101766] 8222157 55629189 86047875 82442376 42519267 ...
##  $ race                    : chr [1:101766] "Caucasian" "Caucasian" "AfricanAmerican" "Caucasian" ...
##  $ gender                  : chr [1:101766] "Female" "Female" "Female" "Male" ...
##  $ age                     : chr [1:101766] "[0-10)" "[10-20)" "[20-30)" "[30-40)" ...
##  $ weight                  : chr [1:101766] "?" "?" "?" "?" ...
##  $ admission_type_id       : num [1:101766] 6 1 1 1 1 2 3 1 2 3 ...
##  $ discharge_disposition_id: num [1:101766] 25 1 1 1 1 1 1 1 1 3 ...
##  $ admission_source_id     : num [1:101766] 1 7 7 7 7 2 2 7 4 4 ...
##  $ time_in_hospital        : num [1:101766] 1 3 2 2 1 3 4 5 13 12 ...
##  $ payer_code              : chr [1:101766] "?" "?" "?" "?" ...
##  $ medical_specialty       : chr [1:101766] "Pediatrics-Endocrinology" "?" "?" "?" ...
##  $ num_lab_procedures      : num [1:101766] 41 59 11 44 51 31 70 73 68 33 ...
##  $ num_procedures          : num [1:101766] 0 0 5 1 0 6 1 0 2 3 ...
##  $ num_medications         : num [1:101766] 1 18 13 16 8 16 21 12 28 18 ...
##  $ number_outpatient       : num [1:101766] 0 0 2 0 0 0 0 0 0 0 ...
##  $ number_emergency        : num [1:101766] 0 0 0 0 0 0 0 0 0 0 ...
##  $ number_inpatient        : num [1:101766] 0 0 1 0 0 0 0 0 0 0 ...
##  $ diag_1                  : chr [1:101766] "250.83" "276" "648" "8" ...
##  $ diag_2                  : chr [1:101766] "?" "250.01" "250" "250.43" ...
##  $ diag_3                  : chr [1:101766] "?" "255" "V27" "403" ...
##  $ number_diagnoses        : num [1:101766] 1 9 6 7 5 9 7 8 8 8 ...
##  $ max_glu_serum           : chr [1:101766] "None" "None" "None" "None" ...
##  $ A1Cresult               : chr [1:101766] "None" "None" "None" "None" ...
##  $ metformin               : chr [1:101766] "No" "No" "No" "No" ...
##  $ repaglinide             : chr [1:101766] "No" "No" "No" "No" ...
##  $ nateglinide             : chr [1:101766] "No" "No" "No" "No" ...
##  $ chlorpropamide          : chr [1:101766] "No" "No" "No" "No" ...
##  $ glimepiride             : chr [1:101766] "No" "No" "No" "No" ...
##  $ acetohexamide           : chr [1:101766] "No" "No" "No" "No" ...
##  $ glipizide               : chr [1:101766] "No" "No" "Steady" "No" ...
##  $ glyburide               : chr [1:101766] "No" "No" "No" "No" ...
##  $ tolbutamide             : chr [1:101766] "No" "No" "No" "No" ...
##  $ pioglitazone            : chr [1:101766] "No" "No" "No" "No" ...
##  $ rosiglitazone           : chr [1:101766] "No" "No" "No" "No" ...
##  $ acarbose                : chr [1:101766] "No" "No" "No" "No" ...
##  $ miglitol                : chr [1:101766] "No" "No" "No" "No" ...
##  $ troglitazone            : chr [1:101766] "No" "No" "No" "No" ...
##  $ tolazamide              : chr [1:101766] "No" "No" "No" "No" ...
##  $ examide                 : chr [1:101766] "No" "No" "No" "No" ...
##  $ citoglipton             : chr [1:101766] "No" "No" "No" "No" ...
##  $ insulin                 : chr [1:101766] "No" "Up" "No" "Up" ...
##  $ glyburide-metformin     : chr [1:101766] "No" "No" "No" "No" ...
##  $ glipizide-metformin     : chr [1:101766] "No" "No" "No" "No" ...
##  $ glimepiride-pioglitazone: chr [1:101766] "No" "No" "No" "No" ...
##  $ metformin-rosiglitazone : chr [1:101766] "No" "No" "No" "No" ...
##  $ metformin-pioglitazone  : chr [1:101766] "No" "No" "No" "No" ...
##  $ change                  : chr [1:101766] "No" "Ch" "No" "Ch" ...
##  $ diabetesMed             : chr [1:101766] "No" "Yes" "Yes" "Yes" ...
##  $ readmitted              : chr [1:101766] "NO" ">30" "NO" "NO" ...
############################################################
# 2. Lookup Tables
############################################################

admission_type <- tribble(
  ~admission_type_id, ~admission_type_desc,
  1, "Emergency",
  2, "Urgent",
  3, "Elective",
  4, "Newborn",
  5, "Not Available",
  6, "NULL",
  7, "Trauma Center",
  8, "Not Mapped"
)

discharge_disposition <- tribble(
  ~discharge_disposition_id, ~discharge_disposition_desc,
  1, "Discharged to home",
  2, "Discharged/transferred to another short term hospital",
  3, "Discharged/transferred to SNF",
  4, "Discharged/transferred to ICF",
  5, "Discharged/transferred to another type of inpatient care institution",
  6, "Discharged/transferred to home with home health service",
  7, "Left AMA",
  8, "Discharged/transferred to home under care of Home IV provider",
  9, "Admitted as an inpatient to this hospital",
  10, "Neonate discharged to another hospital for neonatal aftercare",
  11, "Expired",
  12, "Still patient or expected to return for outpatient services",
  13, "Hospice / home",
  14, "Hospice / medical facility",
  15, "Discharged/transferred within this institution to Medicare approved swing bed",
  16, "Discharged/transferred/referred another institution for outpatient services",
  17, "Discharged/transferred/referred to this institution for outpatient services",
  18, "NULL",
  19, "Expired at home. Medicaid only, hospice.",
  20, "Expired in a medical facility. Medicaid only, hospice.",
  21, "Expired, place unknown. Medicaid only, hospice.",
  22, "Discharged/transferred to another rehab fac including rehab units of a hospital",
  23, "Discharged/transferred to a long term care hospital",
  24, "Discharged/transferred to a nursing facility certified under Medicaid but not certified under Medicare",
  25, "Not Mapped",
  26, "Unknown/Invalid",
  27, "Discharged/transferred to a federal health care facility",
  28, "Discharged/transferred/referred to a psychiatric hospital of psychiatric distinct part unit of a hospital",
  29, "Discharged/transferred to a Critical Access Hospital (CAH)",
  30, "Discharged/transferred to another Type of Health Care Institution not Defined Elsewhere"
)

admission_source <- tribble(
  ~admission_source_id, ~admission_source_desc,
  1, "Physician Referral",
  2, "Clinic Referral",
  3, "HMO Referral",
  4, "Transfer from a hospital",
  5, "Transfer from a Skilled Nursing Facility (SNF)",
  6, "Transfer from another health care facility",
  7, "Emergency Room",
  8, "Court/Law Enforcement",
  9, "Not Available",
  10, "Transfer from critical access hospital",
  11, "Normal Delivery",
  12, "Premature Delivery",
  13, "Sick Baby",
  14, "Extramural Birth",
  15, "Not Available",
  17, "NULL",
  18, "Transfer From Another Home Health Agency",
  19, "Readmission to Same Home Health Agency",
  20, "Not Mapped",
  21, "Unknown/Invalid",
  22, "Transfer from hospital inpatient/same facility result in a separate claim",
  23, "Born inside this hospital",
  24, "Born outside this hospital",
  25, "Transfer from Ambulatory Surgery Center",
  26, "Transfer from Hospice"
)

############################################################
# 3. Data Preparation
############################################################

diabetic <- diabetic_raw %>%
  mutate(across(where(is.character), ~ na_if(.x, "?"))) %>%
  left_join(admission_type, by = "admission_type_id") %>%
  left_join(discharge_disposition, by = "discharge_disposition_id") %>%
  left_join(admission_source, by = "admission_source_id") %>%
  mutate(
    Readmit_30 = ifelse(readmitted == "<30", "Readmitted_30", "Not_Readmitted"),
    Readmit_30 = factor(Readmit_30, levels = c("Not_Readmitted", "Readmitted_30")),
    readmitted_30_flag = ifelse(readmitted == "<30", 1, 0),
    admission_group = case_when(
      admission_type_desc %in% c("Emergency", "Urgent") ~ "Emergency_Urgent",
      admission_type_desc == "Elective" ~ "Elective",
      TRUE ~ "Other"
    ),
    discharge_group = case_when(
      discharge_disposition_id == 1 ~ "Home",
      discharge_disposition_id == 3 ~ "SNF",
      discharge_disposition_id == 6 ~ "Home_Health",
      discharge_disposition_id == 22 ~ "Rehab",
      discharge_disposition_id %in% c(11, 13, 14, 19, 20, 21) ~ "Expired_Hospice",
      TRUE ~ "Other"
    ),
    prior_utilization = number_outpatient + number_emergency + number_inpatient,
    prior_hospital_use = ifelse(number_inpatient > 0 | number_emergency > 0, "Yes", "No"),
    insulin_change = ifelse(insulin %in% c("Up", "Down"), "Changed", "Not_Changed"),
    medication_change = ifelse(change == "Ch", "Changed", "Not_Changed"),
    high_diagnosis_count = ifelse(number_diagnoses >= median(number_diagnoses, na.rm = TRUE), "High", "Low"),
    high_med_count = ifelse(num_medications >= median(num_medications, na.rm = TRUE), "High", "Low"),
    age = as.factor(age),
    race = as.factor(race),
    gender = as.factor(gender),
    admission_group = as.factor(admission_group),
    discharge_group = as.factor(discharge_group),
    admission_source_desc = as.factor(admission_source_desc),
    A1Cresult = as.factor(A1Cresult),
    insulin = as.factor(insulin),
    diabetesMed = as.factor(diabetesMed),
    medication_change = as.factor(medication_change),
    insulin_change = as.factor(insulin_change),
    prior_hospital_use = as.factor(prior_hospital_use),
    high_diagnosis_count = as.factor(high_diagnosis_count),
    high_med_count = as.factor(high_med_count)
  )

############################################################
# 4. Target Review and Basic EDA
############################################################

table(diabetic$Readmit_30)
## 
## Not_Readmitted  Readmitted_30 
##          90409          11357
prop.table(table(diabetic$Readmit_30))
## 
## Not_Readmitted  Readmitted_30 
##      0.8884008      0.1115992
readmission_summary <- diabetic %>%
  count(Readmit_30) %>%
  mutate(percent = round(n / sum(n) * 100, 2))

readmission_summary
## # A tibble: 2 × 3
##   Readmit_30         n percent
##   <fct>          <int>   <dbl>
## 1 Not_Readmitted 90409    88.8
## 2 Readmitted_30  11357    11.2
write_csv(readmission_summary, "readmission_target_summary.csv")

ggplot(diabetic, aes(x = Readmit_30)) +
  geom_bar(fill = "steelblue") +
  labs(
    title = "30-Day Readmission Target Distribution",
    x = "Readmission Status",
    y = "Number of Encounters"
  ) +
  theme_minimal()

ggsave("01_target_distribution.png", width = 7, height = 5)

eda_summary <- diabetic %>%
  group_by(Readmit_30) %>%
  summarise(
    n = n(),
    avg_time_in_hospital = round(mean(time_in_hospital, na.rm = TRUE), 2),
    avg_lab_procedures = round(mean(num_lab_procedures, na.rm = TRUE), 2),
    avg_medications = round(mean(num_medications, na.rm = TRUE), 2),
    avg_diagnoses = round(mean(number_diagnoses, na.rm = TRUE), 2),
    avg_outpatient = round(mean(number_outpatient, na.rm = TRUE), 2),
    avg_emergency = round(mean(number_emergency, na.rm = TRUE), 2),
    avg_inpatient = round(mean(number_inpatient, na.rm = TRUE), 2),
    .groups = "drop"
  )

eda_summary
## # A tibble: 2 × 9
##   Readmit_30         n avg_time_in_hospital avg_lab_procedures avg_medications
##   <fct>          <int>                <dbl>              <dbl>           <dbl>
## 1 Not_Readmitted 90409                 4.35               43.0            15.9
## 2 Readmitted_30  11357                 4.77               44.2            16.9
## # ℹ 4 more variables: avg_diagnoses <dbl>, avg_outpatient <dbl>,
## #   avg_emergency <dbl>, avg_inpatient <dbl>
write_csv(eda_summary, "eda_summary_by_readmission.csv")

ggplot(diabetic, aes(x = Readmit_30, y = number_inpatient)) +
  geom_boxplot() +
  labs(
    title = "Prior Inpatient Visits by Readmission Status",
    x = "Readmission Status",
    y = "Prior Inpatient Visits"
  ) +
  theme_minimal()

ggsave("02_prior_inpatient_by_readmission.png", width = 7, height = 5)

ggplot(diabetic, aes(x = insulin_change, fill = Readmit_30)) +
  geom_bar(position = "fill") +
  labs(
    title = "Readmission Status by Insulin Change",
    x = "Insulin Change",
    y = "Proportion",
    fill = "Readmission Status"
  ) +
  theme_minimal()

ggsave("03_insulin_change_readmission.png", width = 7, height = 5)

############################################################
# 5. Model-Ready Dataset
############################################################

model_data <- diabetic %>%
  filter(discharge_group != "Expired_Hospice") %>%
  select(
    Readmit_30,
    age,
    race,
    admission_group,
    discharge_group,
    admission_source_desc,
    time_in_hospital,
    num_lab_procedures,
    num_procedures,
    num_medications,
    number_outpatient,
    number_emergency,
    number_inpatient,
    prior_utilization,
    prior_hospital_use,
    number_diagnoses,
    A1Cresult,
    insulin,
    insulin_change,
    medication_change,
    diabetesMed,
    high_diagnosis_count,
    high_med_count
  ) %>%
  drop_na()

head(model_data)
## # A tibble: 6 × 23
##   Readmit_30   age   race  admission_group discharge_group admission_source_desc
##   <fct>        <fct> <fct> <fct>           <fct>           <fct>                
## 1 Not_Readmit… [0-1… Cauc… Other           Other           Physician Referral   
## 2 Not_Readmit… [10-… Cauc… Emergency_Urge… Home            Emergency Room       
## 3 Not_Readmit… [20-… Afri… Emergency_Urge… Home            Emergency Room       
## 4 Not_Readmit… [30-… Cauc… Emergency_Urge… Home            Emergency Room       
## 5 Not_Readmit… [40-… Cauc… Emergency_Urge… Home            Emergency Room       
## 6 Not_Readmit… [50-… Cauc… Emergency_Urge… Home            Clinic Referral      
## # ℹ 17 more variables: time_in_hospital <dbl>, num_lab_procedures <dbl>,
## #   num_procedures <dbl>, num_medications <dbl>, number_outpatient <dbl>,
## #   number_emergency <dbl>, number_inpatient <dbl>, prior_utilization <dbl>,
## #   prior_hospital_use <fct>, number_diagnoses <dbl>, A1Cresult <fct>,
## #   insulin <fct>, insulin_change <fct>, medication_change <fct>,
## #   diabetesMed <fct>, high_diagnosis_count <fct>, high_med_count <fct>
dim(model_data)
## [1] 97109    23
table(model_data$Readmit_30)
## 
## Not_Readmitted  Readmitted_30 
##          85983          11126
prop.table(table(model_data$Readmit_30))
## 
## Not_Readmitted  Readmitted_30 
##      0.8854277      0.1145723
write_csv(model_data, "diabetes_model_ready.csv")

############################################################
# 6. Train/Test Split
############################################################

model_data$Readmit_30 <- factor(
  as.character(model_data$Readmit_30),
  levels = c("Not_Readmitted", "Readmitted_30")
)

model_data <- model_data %>%
  mutate(
    across(
      .cols = c(
        age,
        race,
        admission_group,
        discharge_group,
        A1Cresult,
        insulin,
        insulin_change,
        medication_change,
        diabetesMed,
        high_diagnosis_count,
        high_med_count
      ),
      .fns = ~ fct_lump_min(., min = 20, other_level = "Other")
    )
  )

model_data$Readmit_30 <- droplevels(model_data$Readmit_30)

train_index <- createDataPartition(
  model_data$Readmit_30,
  p = 0.70,
  list = FALSE
)

train_data <- model_data[train_index, ]
test_data <- model_data[-train_index, ]

factor_vars <- setdiff(
  names(train_data)[sapply(train_data, is.factor)],
  "Readmit_30"
)

for (v in factor_vars) {
  
  train_data[[v]] <- fct_expand(train_data[[v]], "Other")
  
  test_data[[v]] <- fct_other(
    test_data[[v]],
    keep = levels(train_data[[v]])[
      levels(train_data[[v]]) != "Other"
    ],
    other_level = "Other"
  )
  
  test_data[[v]] <- factor(
    test_data[[v]],
    levels = levels(train_data[[v]])
  )
}

prop.table(table(train_data$Readmit_30))
## 
## Not_Readmitted  Readmitted_30 
##      0.8854188      0.1145812
prop.table(table(test_data$Readmit_30))
## 
## Not_Readmitted  Readmitted_30 
##      0.8854485      0.1145515
############################################################
# 7. Baseline Accuracy
############################################################

majority_class <- names(which.max(table(train_data$Readmit_30)))
baseline_accuracy <- mean(test_data$Readmit_30 == majority_class)
baseline_accuracy
## [1] 0.8854485
############################################################
# 8. Logistic Regression Model
############################################################

logit_model <- glm(
  Readmit_30 ~
    age + race + admission_group + discharge_group +
    time_in_hospital + num_lab_procedures + num_procedures +
    num_medications + number_outpatient + number_emergency +
    number_inpatient + prior_utilization + prior_hospital_use +
    number_diagnoses + A1Cresult + insulin + insulin_change +
    medication_change + diabetesMed + high_diagnosis_count + high_med_count,
  data = train_data,
  family = binomial
)

summary(logit_model)
## 
## Call:
## glm(formula = Readmit_30 ~ age + race + admission_group + discharge_group + 
##     time_in_hospital + num_lab_procedures + num_procedures + 
##     num_medications + number_outpatient + number_emergency + 
##     number_inpatient + prior_utilization + prior_hospital_use + 
##     number_diagnoses + A1Cresult + insulin + insulin_change + 
##     medication_change + diabetesMed + high_diagnosis_count + 
##     high_med_count, family = binomial, data = train_data)
## 
## Coefficients: (2 not defined because of singularities)
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     -4.5219144  0.7279291  -6.212 5.23e-10 ***
## age[10-20)                       0.9573203  0.7391770   1.295 0.195280    
## age[20-30)                       1.5559482  0.7207939   2.159 0.030877 *  
## age[30-40)                       1.3930218  0.7181915   1.940 0.052425 .  
## age[40-50)                       1.3472954  0.7167209   1.880 0.060135 .  
## age[50-60)                       1.2519629  0.7164167   1.748 0.080545 .  
## age[60-70)                       1.3578277  0.7162935   1.896 0.058009 .  
## age[70-80)                       1.3949600  0.7162728   1.948 0.051472 .  
## age[80-90)                       1.3800691  0.7165438   1.926 0.054103 .  
## age[90-100)                      1.3162757  0.7198196   1.829 0.067457 .  
## raceAsian                        0.2592958  0.1492476   1.737 0.082325 .  
## raceCaucasian                   -0.0039834  0.0321969  -0.124 0.901536    
## raceHispanic                    -0.0305772  0.0926311  -0.330 0.741327    
## raceOther                       -0.1235822  0.1111361  -1.112 0.266143    
## admission_groupEmergency_Urgent  0.0580347  0.0351588   1.651 0.098811 .  
## admission_groupOther             0.0264541  0.0499516   0.530 0.596394    
## discharge_groupHome_Health       0.1867381  0.0383211   4.873 1.10e-06 ***
## discharge_groupRehab             1.3345045  0.0644987  20.690  < 2e-16 ***
## discharge_groupSNF               0.3317891  0.0380310   8.724  < 2e-16 ***
## discharge_groupOther             0.4527593  0.0395776  11.440  < 2e-16 ***
## time_in_hospital                 0.0019630  0.0048158   0.408 0.683555    
## num_lab_procedures               0.0018166  0.0007193   2.526 0.011550 *  
## num_procedures                  -0.0125493  0.0084029  -1.493 0.135319    
## num_medications                  0.0017449  0.0025130   0.694 0.487458    
## number_outpatient               -0.0109355  0.0093223  -1.173 0.240776    
## number_emergency                 0.0218240  0.0103247   2.114 0.034534 *  
## number_inpatient                 0.2230864  0.0096190  23.192  < 2e-16 ***
## prior_utilization                       NA         NA      NA       NA    
## prior_hospital_useYes            0.2127560  0.0315337   6.747 1.51e-11 ***
## number_diagnoses                 0.0580651  0.0148632   3.907 9.36e-05 ***
## A1Cresult>8                      0.0755534  0.0810859   0.932 0.351456    
## A1CresultNone                    0.1359545  0.0688742   1.974 0.048387 *  
## A1CresultNorm                   -0.0017525  0.0892279  -0.020 0.984330    
## insulinNo                       -0.2923995  0.0483304  -6.050 1.45e-09 ***
## insulinSteady                   -0.2216333  0.0440672  -5.029 4.92e-07 ***
## insulinUp                       -0.1269756  0.0474863  -2.674 0.007497 ** 
## insulin_changeNot_Changed               NA         NA      NA       NA    
## medication_changeNot_Changed     0.1391060  0.0350277   3.971 7.15e-05 ***
## diabetesMedYes                   0.1504966  0.0398320   3.778 0.000158 ***
## high_diagnosis_countLow          0.0634175  0.0547461   1.158 0.246703    
## high_med_countLow               -0.0681849  0.0362036  -1.883 0.059650 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 48399  on 67977  degrees of freedom
## Residual deviance: 46125  on 67939  degrees of freedom
## AIC: 46203
## 
## Number of Fisher Scoring iterations: 6
test_data <- test_data %>%
  mutate(
    logit_prob = predict(logit_model, newdata = test_data, type = "response"),
    logit_pred_50 = ifelse(logit_prob >= 0.50, "Readmitted_30", "Not_Readmitted"),
    logit_pred_50 = factor(logit_pred_50, levels = levels(Readmit_30)),
    logit_pred_15 = ifelse(logit_prob >= 0.15, "Readmitted_30", "Not_Readmitted"),
    logit_pred_15 = factor(logit_pred_15, levels = levels(Readmit_30))
  )

logit_accuracy_50 <- mean(test_data$logit_pred_50 == test_data$Readmit_30)
logit_accuracy_15 <- mean(test_data$logit_pred_15 == test_data$Readmit_30)

logit_accuracy_50
## [1] 0.8853798
logit_accuracy_15
## [1] 0.7843191
confusionMatrix(
  data = test_data$logit_pred_50,
  reference = test_data$Readmit_30,
  positive = "Readmitted_30"
)
## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       Not_Readmitted Readmitted_30
##   Not_Readmitted          25752          3297
##   Readmitted_30              42            40
##                                          
##                Accuracy : 0.8854         
##                  95% CI : (0.8817, 0.889)
##     No Information Rate : 0.8854         
##     P-Value [Acc > NIR] : 0.5193         
##                                          
##                   Kappa : 0.018          
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.011987       
##             Specificity : 0.998372       
##          Pos Pred Value : 0.487805       
##          Neg Pred Value : 0.886502       
##              Prevalence : 0.114552       
##          Detection Rate : 0.001373       
##    Detection Prevalence : 0.002815       
##       Balanced Accuracy : 0.505179       
##                                          
##        'Positive' Class : Readmitted_30  
## 
confusionMatrix(
  data = test_data$logit_pred_15,
  reference = test_data$Readmit_30,
  positive = "Readmitted_30"
)
## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       Not_Readmitted Readmitted_30
##   Not_Readmitted          21672          2161
##   Readmitted_30            4122          1176
##                                          
##                Accuracy : 0.7843         
##                  95% CI : (0.7796, 0.789)
##     No Information Rate : 0.8854         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.1534         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.35241        
##             Specificity : 0.84020        
##          Pos Pred Value : 0.22197        
##          Neg Pred Value : 0.90933        
##              Prevalence : 0.11455        
##          Detection Rate : 0.04037        
##    Detection Prevalence : 0.18187        
##       Balanced Accuracy : 0.59630        
##                                          
##        'Positive' Class : Readmitted_30  
## 
logit_roc <- roc(
  response = test_data$Readmit_30,
  predictor = test_data$logit_prob,
  levels = c("Not_Readmitted", "Readmitted_30")
)
## Setting direction: controls < cases
logit_auc <- auc(logit_roc)
logit_auc
## Area under the curve: 0.6604
############################################################
# 9. Decision Tree Model
############################################################

tree_model <- rpart(
  Readmit_30 ~
    age + race + admission_group + discharge_group +
    time_in_hospital + num_lab_procedures + num_procedures +
    num_medications + number_outpatient + number_emergency +
    number_inpatient + prior_hospital_use +
    number_diagnoses + A1Cresult + insulin +
    medication_change + diabetesMed + high_diagnosis_count + high_med_count,
  data = train_data,
  method = "class",
  parms = list(prior = c(0.50, 0.50)),
  control = rpart.control(cp = 0.0001, minsplit = 50, maxdepth = 5)
)

print(tree_model)
## n= 67978 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 67978 33989.0000 Readmitted_30 (0.5000000 0.5000000)  
##    2) number_inpatient< 1.5 58199 24655.0100 Not_Readmitted (0.5461962 0.4538038)  
##      4) discharge_group=Home 36185 12196.5900 Not_Readmitted (0.6072214 0.3927786)  
##        8) prior_hospital_use=No 26831  7758.6910 Not_Readmitted (0.6458227 0.3541773) *
##        9) prior_hospital_use=Yes 9354  4437.9010 Not_Readmitted (0.5147630 0.4852370)  
##         18) number_diagnoses< 7.5 3842  1531.6650 Not_Readmitted (0.5627623 0.4372377) *
##         19) number_diagnoses>=7.5 5512  2736.5580 Readmitted_30 (0.4849651 0.5150349)  
##           38) num_medications< 17.5 3375  1623.3030 Not_Readmitted (0.5109223 0.4890777) *
##           39) num_medications>=17.5 2137  1040.7500 Readmitted_30 (0.4478882 0.5521118) *
##      5) discharge_group=Home_Health,Rehab,SNF,Other 22014 10819.1700 Readmitted_30 (0.4647893 0.5352107)  
##       10) discharge_group=Home_Health,SNF,Other 20814 10325.6200 Readmitted_30 (0.4833761 0.5166239)  
##         20) prior_hospital_use=No 14189  6859.7650 Not_Readmitted (0.5094787 0.4905213) *
##         21) prior_hospital_use=Yes 6625  3200.7450 Readmitted_30 (0.4338921 0.5661079) *
##       11) discharge_group=Rehab 1200   493.5517 Readmitted_30 (0.2575782 0.7424218) *
##    3) number_inpatient>=1.5 9779  4314.3420 Readmitted_30 (0.3161076 0.6838924) *
printcp(tree_model)
## 
## Classification tree:
## rpart(formula = Readmit_30 ~ age + race + admission_group + discharge_group + 
##     time_in_hospital + num_lab_procedures + num_procedures + 
##     num_medications + number_outpatient + number_emergency + 
##     number_inpatient + prior_hospital_use + number_diagnoses + 
##     A1Cresult + insulin + medication_change + diabetesMed + high_diagnosis_count + 
##     high_med_count, data = train_data, method = "class", parms = list(prior = c(0.5, 
##     0.5)), control = rpart.control(cp = 1e-04, minsplit = 50, 
##     maxdepth = 5))
## 
## Variables actually used in tree construction:
## [1] discharge_group    num_medications    number_diagnoses   number_inpatient  
## [5] prior_hospital_use
## 
## Root node error: 33989/67978 = 0.5
## 
## n= 67978 
## 
##          CP nsplit rel error  xerror      xstd
## 1 0.1476846      0   1.00000 1.01659 0.0069309
## 2 0.0482286      1   0.85232 0.85232 0.0091952
## 3 0.0039000      2   0.80409 0.80541 0.0069028
## 4 0.0024961      4   0.79629 0.80438 0.0071865
## 5 0.0021332      6   0.79129 0.79705 0.0074642
## 6 0.0001000      7   0.78916 0.79663 0.0073978
png("04_decision_tree.png", width = 1200, height = 800)
plot(tree_model, uniform = TRUE, margin = 0.1)
text(tree_model, use.n = TRUE, cex = 0.7)
dev.off()
## quartz_off_screen 
##                 2
tree_prob_matrix <- predict(tree_model, newdata = test_data, type = "prob")
tree_prob <- as.numeric(tree_prob_matrix[, "Readmitted_30"])

test_data <- test_data %>%
  mutate(
    tree_prob = tree_prob,
    tree_pred_50 = ifelse(tree_prob >= 0.50, "Readmitted_30", "Not_Readmitted"),
    tree_pred_50 = factor(tree_pred_50, levels = levels(Readmit_30)),
    tree_pred_15 = ifelse(tree_prob >= 0.15, "Readmitted_30", "Not_Readmitted"),
    tree_pred_15 = factor(tree_pred_15, levels = levels(Readmit_30))
  )

tree_accuracy_50 <- mean(test_data$tree_pred_50 == test_data$Readmit_30)
tree_accuracy_15 <- mean(test_data$tree_pred_15 == test_data$Readmit_30)

tree_accuracy_50
## [1] 0.7059147
tree_accuracy_15
## [1] 0.1145515
confusionMatrix(
  data = test_data$tree_pred_50,
  reference = test_data$Readmit_30,
  positive = "Readmitted_30"
)
## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       Not_Readmitted Readmitted_30
##   Not_Readmitted          18919          1692
##   Readmitted_30            6875          1645
##                                           
##                Accuracy : 0.7059          
##                  95% CI : (0.7006, 0.7111)
##     No Information Rate : 0.8854          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1351          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.49296         
##             Specificity : 0.73347         
##          Pos Pred Value : 0.19308         
##          Neg Pred Value : 0.91791         
##              Prevalence : 0.11455         
##          Detection Rate : 0.05647         
##    Detection Prevalence : 0.29247         
##       Balanced Accuracy : 0.61321         
##                                           
##        'Positive' Class : Readmitted_30   
## 
confusionMatrix(
  data = test_data$tree_pred_15,
  reference = test_data$Readmit_30,
  positive = "Readmitted_30"
)
## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       Not_Readmitted Readmitted_30
##   Not_Readmitted              0             0
##   Readmitted_30           25794          3337
##                                           
##                Accuracy : 0.1146          
##                  95% CI : (0.1109, 0.1183)
##     No Information Rate : 0.8854          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.1146          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.1146          
##          Detection Rate : 0.1146          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : Readmitted_30   
## 
tree_roc <- roc(
  response = test_data$Readmit_30,
  predictor = test_data$tree_prob,
  levels = c("Not_Readmitted", "Readmitted_30")
)
## Setting direction: controls < cases
tree_auc <- auc(tree_roc)
tree_auc
## Area under the curve: 0.6494
############################################################
# 10. Random Forest Model
############################################################

rf_train_data <- train_data

if (nrow(rf_train_data) > 25000) {
  rf_index <- createDataPartition(
    rf_train_data$Readmit_30,
    p = 25000 / nrow(rf_train_data),
    list = FALSE
  )
  rf_train_data <- rf_train_data[rf_index, ]
}

rf_model <- randomForest(
  Readmit_30 ~
    age + race + admission_group + discharge_group +
    time_in_hospital + num_lab_procedures + num_procedures +
    num_medications + number_outpatient + number_emergency +
    number_inpatient + prior_utilization + prior_hospital_use +
    number_diagnoses + A1Cresult + insulin + insulin_change +
    medication_change + diabetesMed + high_diagnosis_count + high_med_count,
  data = rf_train_data,
  ntree = 300,
  importance = TRUE
)

rf_model
## 
## Call:
##  randomForest(formula = Readmit_30 ~ age + race + admission_group +      discharge_group + time_in_hospital + num_lab_procedures +      num_procedures + num_medications + number_outpatient + number_emergency +      number_inpatient + prior_utilization + prior_hospital_use +      number_diagnoses + A1Cresult + insulin + insulin_change +      medication_change + diabetesMed + high_diagnosis_count +      high_med_count, data = rf_train_data, ntree = 300, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 300
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 11.49%
## Confusion matrix:
##                Not_Readmitted Readmitted_30 class.error
## Not_Readmitted          22086            50 0.002258764
## Readmitted_30            2823            42 0.985340314
rf_prob_matrix <- predict(rf_model, newdata = test_data, type = "prob")
rf_prob <- as.numeric(rf_prob_matrix[, "Readmitted_30"])

test_data <- test_data %>%
  mutate(
    rf_prob = rf_prob,
    rf_pred_50 = ifelse(rf_prob >= 0.50, "Readmitted_30", "Not_Readmitted"),
    rf_pred_50 = factor(rf_pred_50, levels = levels(Readmit_30)),
    rf_pred_15 = ifelse(rf_prob >= 0.15, "Readmitted_30", "Not_Readmitted"),
    rf_pred_15 = factor(rf_pred_15, levels = levels(Readmit_30))
  )

rf_accuracy_50 <- mean(test_data$rf_pred_50 == test_data$Readmit_30)
rf_accuracy_15 <- mean(test_data$rf_pred_15 == test_data$Readmit_30)

rf_accuracy_50
## [1] 0.8858261
rf_accuracy_15
## [1] 0.7004909
confusionMatrix(
  data = test_data$rf_pred_50,
  reference = test_data$Readmit_30,
  positive = "Readmitted_30"
)
## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       Not_Readmitted Readmitted_30
##   Not_Readmitted          25755          3287
##   Readmitted_30              39            50
##                                           
##                Accuracy : 0.8858          
##                  95% CI : (0.8821, 0.8895)
##     No Information Rate : 0.8854          
##     P-Value [Acc > NIR] : 0.4243          
##                                           
##                   Kappa : 0.0234          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.014984        
##             Specificity : 0.998488        
##          Pos Pred Value : 0.561798        
##          Neg Pred Value : 0.886819        
##              Prevalence : 0.114552        
##          Detection Rate : 0.001716        
##    Detection Prevalence : 0.003055        
##       Balanced Accuracy : 0.506736        
##                                           
##        'Positive' Class : Readmitted_30   
## 
confusionMatrix(
  data = test_data$rf_pred_15,
  reference = test_data$Readmit_30,
  positive = "Readmitted_30"
)
## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       Not_Readmitted Readmitted_30
##   Not_Readmitted          18909          1840
##   Readmitted_30            6885          1497
##                                           
##                Accuracy : 0.7005          
##                  95% CI : (0.6952, 0.7057)
##     No Information Rate : 0.8854          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1096          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.44861         
##             Specificity : 0.73308         
##          Pos Pred Value : 0.17860         
##          Neg Pred Value : 0.91132         
##              Prevalence : 0.11455         
##          Detection Rate : 0.05139         
##    Detection Prevalence : 0.28773         
##       Balanced Accuracy : 0.59084         
##                                           
##        'Positive' Class : Readmitted_30   
## 
rf_roc <- roc(
  response = test_data$Readmit_30,
  predictor = test_data$rf_prob,
  levels = c("Not_Readmitted", "Readmitted_30")
)
## Setting direction: controls < cases
rf_auc <- auc(rf_roc)
rf_auc
## Area under the curve: 0.6332
importance_table <- as.data.frame(importance(rf_model)) %>%
  rownames_to_column("Variable") %>%
  arrange(desc(MeanDecreaseGini))

importance_table
##                Variable Not_Readmitted Readmitted_30 MeanDecreaseAccuracy
## 1    num_lab_procedures      23.285749    -7.0026099            19.804641
## 2       num_medications      28.244949   -10.8445947            24.209542
## 3      time_in_hospital      27.022479    -7.0203465            24.892091
## 4                   age      17.456273    -5.5285031            14.850999
## 5        num_procedures      17.627735    -4.3878906            15.147771
## 6       discharge_group      25.691344    10.5893495            28.604706
## 7      number_diagnoses      19.558166    -5.7847059            17.374685
## 8     prior_utilization      10.354183     8.4006557            13.519617
## 9               insulin       7.864688    -1.1703806             7.323258
## 10     number_inpatient       5.874477    17.8806779            10.472016
## 11                 race       5.660397    -1.9601419             4.677275
## 12      admission_group      17.155422    -5.2218277            14.743514
## 13            A1Cresult       4.338330     2.2907170             4.976087
## 14    number_outpatient      17.826535    -4.9358317            16.584679
## 15     number_emergency       9.496486    -2.3185172             8.735020
## 16    medication_change       8.737793    -3.3757848             7.351964
## 17          diabetesMed       2.198277    -0.5577732             1.922025
## 18       high_med_count      15.889228    -7.5725669            14.927081
## 19 high_diagnosis_count      13.508832    -7.6961569            12.024778
## 20       insulin_change       2.982510     0.8371765             3.107998
## 21   prior_hospital_use       2.704876     9.0583866             6.058677
##    MeanDecreaseGini
## 1         715.74316
## 2         575.54567
## 3         425.95849
## 4         404.80839
## 5         296.56759
## 6         254.38006
## 7         228.86677
## 8         188.87266
## 9         187.35124
## 10        171.52202
## 11        151.91194
## 12        141.11304
## 13        131.59531
## 14        116.85064
## 15         87.11908
## 16         86.44838
## 17         64.80766
## 18         60.63315
## 19         49.36050
## 20         47.79434
## 21         34.55936
write_csv(importance_table, "random_forest_variable_importance.csv")

ggplot(importance_table %>% slice_max(MeanDecreaseGini, n = 12),
       aes(x = reorder(Variable, MeanDecreaseGini), y = MeanDecreaseGini)) +
  geom_col(fill = "darkgreen") +
  coord_flip() +
  labs(
    title = "Top Random Forest Predictors",
    x = "Predictor",
    y = "Importance"
  ) +
  theme_minimal()

ggsave("05_random_forest_variable_importance.png", width = 8, height = 6)

############################################################
# 11. kNN Model
############################################################

knn_data <- model_data %>%
  select(
    Readmit_30,
    time_in_hospital,
    num_lab_procedures,
    num_procedures,
    num_medications,
    number_outpatient,
    number_emergency,
    number_inpatient,
    prior_utilization,
    number_diagnoses
  )

knn_train <- knn_data[train_index, ]
knn_test <- knn_data[-train_index, ]

train_control <- trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE
)

knn_grid <- expand.grid(k = seq(3, 21, by = 2))

knn_model <- train(
  Readmit_30 ~ .,
  data = knn_train,
  method = "knn",
  trControl = train_control,
  preProcess = c("center", "scale"),
  tuneGrid = knn_grid
)

knn_model
## k-Nearest Neighbors 
## 
## 67978 samples
##     9 predictor
##     2 classes: 'Not_Readmitted', 'Readmitted_30' 
## 
## Pre-processing: centered (9), scaled (9) 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 54382, 54382, 54383, 54383, 54382 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa     
##    3  0.8592486  0.04221908
##    5  0.8749448  0.03787087
##    7  0.8801230  0.03086136
##    9  0.8826679  0.02728957
##   11  0.8840360  0.02590649
##   13  0.8843744  0.02119723
##   15  0.8846097  0.01961922
##   17  0.8850216  0.02141458
##   19  0.8851687  0.01925817
##   21  0.8853011  0.01858526
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 21.
knn_model$bestTune
##     k
## 10 21
knn_pred <- predict(knn_model, newdata = knn_test)

knn_prob <- predict(knn_model, newdata = knn_test, type = "prob")[, "Readmitted_30"]

cm_knn <- confusionMatrix(
  data = knn_pred,
  reference = knn_test$Readmit_30,
  positive = "Readmitted_30"
)

cm_knn
## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       Not_Readmitted Readmitted_30
##   Not_Readmitted          25746          3293
##   Readmitted_30              48            44
##                                           
##                Accuracy : 0.8853          
##                  95% CI : (0.8816, 0.8889)
##     No Information Rate : 0.8854          
##     P-Value [Acc > NIR] : 0.5339          
##                                           
##                   Kappa : 0.0196          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.013185        
##             Specificity : 0.998139        
##          Pos Pred Value : 0.478261        
##          Neg Pred Value : 0.886601        
##              Prevalence : 0.114552        
##          Detection Rate : 0.001510        
##    Detection Prevalence : 0.003158        
##       Balanced Accuracy : 0.505662        
##                                           
##        'Positive' Class : Readmitted_30   
## 
knn_roc <- roc(
  response = knn_test$Readmit_30,
  predictor = knn_prob,
  levels = c("Not_Readmitted", "Readmitted_30")
)
## Setting direction: controls < cases
knn_auc <- auc(knn_roc)
knn_auc
## Area under the curve: 0.5915
############################################################
# 12. ROC Curve Comparison
############################################################

png("06_roc_curve_comparison.png", width = 900, height = 700)

plot(
  logit_roc,
  col = "blue",
  lwd = 3,
  main = "ROC Curve Comparison Across Models",
  legacy.axes = TRUE
)

plot(
  tree_roc,
  col = "red",
  lwd = 3,
  add = TRUE
)

plot(
  rf_roc,
  col = "darkgreen",
  lwd = 3,
  add = TRUE
)

plot(
  knn_roc,
  col = "purple",
  lwd = 3,
  add = TRUE
)

abline(a = 0, b = 1, lty = 2, col = "gray50")

legend(
  "bottomright",
  legend = c(
    paste("Logistic Regression AUC =", round(logit_auc, 3)),
    paste("Decision Tree AUC =", round(tree_auc, 3)),
    paste("Random Forest AUC =", round(rf_auc, 3)),
    paste("KNN AUC =", round(knn_auc, 3))
  ),
  col = c("blue", "red", "darkgreen", "purple"),
  lwd = 3,
  cex = 0.9
)

dev.off()
## quartz_off_screen 
##                 2
############################################################
# 13. Model Comparison Tables
############################################################

baseline_pred <- factor(
  rep("Not_Readmitted", nrow(test_data)),
  levels = levels(test_data$Readmit_30)
)

cm_baseline <- confusionMatrix(
  data = baseline_pred,
  reference = test_data$Readmit_30,
  positive = "Readmitted_30"
)

cm_logit_50 <- confusionMatrix(
  test_data$logit_pred_50,
  test_data$Readmit_30,
  positive = "Readmitted_30"
)

cm_logit_15 <- confusionMatrix(
  test_data$logit_pred_15,
  test_data$Readmit_30,
  positive = "Readmitted_30"
)

cm_tree_50 <- confusionMatrix(
  test_data$tree_pred_50,
  test_data$Readmit_30,
  positive = "Readmitted_30"
)

cm_tree_15 <- confusionMatrix(
  test_data$tree_pred_15,
  test_data$Readmit_30,
  positive = "Readmitted_30"
)

cm_rf_50 <- confusionMatrix(
  test_data$rf_pred_50,
  test_data$Readmit_30,
  positive = "Readmitted_30"
)

cm_rf_15 <- confusionMatrix(
  test_data$rf_pred_15,
  test_data$Readmit_30,
  positive = "Readmitted_30"
)

model_comparison <- tibble(
  Model = c(
    "Majority Class Baseline",
    "Logistic Regression - 0.50 Threshold",
    "Logistic Regression - 0.15 Threshold",
    "Decision Tree - 0.50 Threshold",
    "Decision Tree - 0.15 Threshold",
    "Random Forest - 0.50 Threshold",
    "Random Forest - 0.15 Threshold",
    "KNN Model"
  ),
  
  Accuracy = c(
    cm_baseline$overall["Accuracy"],
    cm_logit_50$overall["Accuracy"],
    cm_logit_15$overall["Accuracy"],
    cm_tree_50$overall["Accuracy"],
    cm_tree_15$overall["Accuracy"],
    cm_rf_50$overall["Accuracy"],
    cm_rf_15$overall["Accuracy"],
    cm_knn$overall["Accuracy"]
  ),
  
  Sensitivity_Recall = c(
    cm_baseline$byClass["Sensitivity"],
    cm_logit_50$byClass["Sensitivity"],
    cm_logit_15$byClass["Sensitivity"],
    cm_tree_50$byClass["Sensitivity"],
    cm_tree_15$byClass["Sensitivity"],
    cm_rf_50$byClass["Sensitivity"],
    cm_rf_15$byClass["Sensitivity"],
    cm_knn$byClass["Sensitivity"]
  ),
  
  Specificity = c(
    cm_baseline$byClass["Specificity"],
    cm_logit_50$byClass["Specificity"],
    cm_logit_15$byClass["Specificity"],
    cm_tree_50$byClass["Specificity"],
    cm_tree_15$byClass["Specificity"],
    cm_rf_50$byClass["Specificity"],
    cm_rf_15$byClass["Specificity"],
    cm_knn$byClass["Specificity"]
  ),
  
  Precision = c(
    NA,
    cm_logit_50$byClass["Pos Pred Value"],
    cm_logit_15$byClass["Pos Pred Value"],
    cm_tree_50$byClass["Pos Pred Value"],
    cm_tree_15$byClass["Pos Pred Value"],
    cm_rf_50$byClass["Pos Pred Value"],
    cm_rf_15$byClass["Pos Pred Value"],
    cm_knn$byClass["Pos Pred Value"]
  ),
  
  Balanced_Accuracy = c(
    cm_baseline$byClass["Balanced Accuracy"],
    cm_logit_50$byClass["Balanced Accuracy"],
    cm_logit_15$byClass["Balanced Accuracy"],
    cm_tree_50$byClass["Balanced Accuracy"],
    cm_tree_15$byClass["Balanced Accuracy"],
    cm_rf_50$byClass["Balanced Accuracy"],
    cm_rf_15$byClass["Balanced Accuracy"],
    cm_knn$byClass["Balanced Accuracy"]
  ),
  
  AUC = c(
    NA,
    as.numeric(logit_auc),
    as.numeric(logit_auc),
    as.numeric(tree_auc),
    as.numeric(tree_auc),
    as.numeric(rf_auc),
    as.numeric(rf_auc),
    as.numeric(knn_auc)
  )
) %>%
  mutate(across(where(is.numeric), ~ round(.x, 4)))

model_comparison
## # A tibble: 8 × 7
##   Model      Accuracy Sensitivity_Recall Specificity Precision Balanced_Accuracy
##   <chr>         <dbl>              <dbl>       <dbl>     <dbl>             <dbl>
## 1 Majority …    0.885             0            1        NA                 0.5  
## 2 Logistic …    0.885             0.012        0.998     0.488             0.505
## 3 Logistic …    0.784             0.352        0.840     0.222             0.596
## 4 Decision …    0.706             0.493        0.734     0.193             0.613
## 5 Decision …    0.115             1            0         0.115             0.5  
## 6 Random Fo…    0.886             0.015        0.998     0.562             0.507
## 7 Random Fo…    0.700             0.449        0.733     0.179             0.591
## 8 KNN Model     0.885             0.0132       0.998     0.478             0.506
## # ℹ 1 more variable: AUC <dbl>
write_csv(
  model_comparison,
  "model_comparison_metrics.csv"
)

ggplot(
  model_comparison %>%
    filter(Model != "Majority Class Baseline"),
  aes(
    x = reorder(Model, Sensitivity_Recall),
    y = Sensitivity_Recall
  )
) +
  geom_col(fill = "darkred") +
  coord_flip() +
  labs(
    title = "Model Comparison by Recall",
    x = "Model",
    y = "Recall / Sensitivity"
  ) +
  theme_minimal()

ggsave(
  "07_model_comparison_recall.png",
  width = 8,
  height = 6
)


############################################################
# 14. Predicted Risk Tiers
############################################################

test_data <- test_data %>%
  mutate(
    predicted_risk_tier = case_when(
      rf_prob >= 0.20 ~ "Very High Risk",
      rf_prob >= 0.15 ~ "High Risk",
      rf_prob >= 0.10 ~ "Moderate Risk",
      TRUE ~ "Low Risk"
    ),
    
    predicted_risk_tier = factor(
      predicted_risk_tier,
      levels = c(
        "Low Risk",
        "Moderate Risk",
        "High Risk",
        "Very High Risk"
      )
    )
  )

risk_tier_summary <- test_data %>%
  group_by(predicted_risk_tier) %>%
  summarise(
    encounters = n(),
    
    percent_of_test_set =
      round(n() / nrow(test_data) * 100, 2),
    
    actual_readmissions =
      sum(Readmit_30 == "Readmitted_30"),
    
    actual_readmission_rate =
      round(
        mean(Readmit_30 == "Readmitted_30") * 100,
        2
      ),
    
    avg_predicted_probability =
      round(mean(rf_prob) * 100, 2),
    
    avg_prior_inpatient =
      round(mean(number_inpatient), 2),
    
    avg_prior_emergency =
      round(mean(number_emergency), 2),
    
    avg_medications =
      round(mean(num_medications), 2),
    
    avg_diagnoses =
      round(mean(number_diagnoses), 2),
    
    avg_time_in_hospital =
      round(mean(time_in_hospital), 2),
    
    .groups = "drop"
  ) %>%
  arrange(desc(actual_readmission_rate))

risk_tier_summary
## # A tibble: 4 × 11
##   predicted_risk_tier encounters percent_of_test_set actual_readmissions
##   <fct>                    <int>               <dbl>               <int>
## 1 Very High Risk            4435                15.2                 941
## 2 High Risk                 3947                13.6                 556
## 3 Moderate Risk             6254                21.5                 722
## 4 Low Risk                 14495                49.8                1118
## # ℹ 7 more variables: actual_readmission_rate <dbl>,
## #   avg_predicted_probability <dbl>, avg_prior_inpatient <dbl>,
## #   avg_prior_emergency <dbl>, avg_medications <dbl>, avg_diagnoses <dbl>,
## #   avg_time_in_hospital <dbl>
write_csv(
  risk_tier_summary,
  "predicted_risk_tier_summary.csv"
)

ggplot(
  risk_tier_summary,
  aes(
    x = reorder(
      predicted_risk_tier,
      actual_readmission_rate
    ),
    y = actual_readmission_rate
  )
) +
  geom_col(fill = "purple") +
  
  geom_text(
    aes(
      label = paste0(actual_readmission_rate, "%")
    ),
    vjust = -0.4
  ) +
  
  labs(
    title =
      "Actual 30-Day Readmission Rate by Predicted Risk Tier",
    
    x = "Predicted Risk Tier",
    
    y = "Actual 30-Day Readmission Rate (%)"
  ) +
  
  theme_minimal()

ggsave(
  "08_actual_readmission_by_risk_tier.png",
  width = 8,
  height = 5
)

############################################################
# 15. Save Final Prediction File
############################################################

final_predictions <- test_data %>%
  
  mutate(
    knn_prob = knn_prob,
    knn_pred = knn_pred
  ) %>%
  
  select(
    
    # Actual Outcome
    Readmit_30,
    
    # Logistic Regression
    logit_prob,
    logit_pred_50,
    logit_pred_15,
    
    # Decision Tree
    tree_prob,
    tree_pred_50,
    tree_pred_15,
    
    # Random Forest
    rf_prob,
    rf_pred_50,
    rf_pred_15,
    
    # KNN
    knn_prob,
    knn_pred,
    
    # Risk Tier
    predicted_risk_tier,
    
    # Business Variables
    age,
    discharge_group,
    time_in_hospital,
    num_lab_procedures,
    num_procedures,
    num_medications,
    number_outpatient,
    number_emergency,
    number_inpatient,
    number_diagnoses,
    prior_utilization,
    insulin_change,
    medication_change
  )

head(final_predictions)
## # A tibble: 6 × 26
##   Readmit_30     logit_prob logit_pred_50  logit_pred_15  tree_prob tree_pred_50
##   <fct>               <dbl> <fct>          <fct>              <dbl> <fct>       
## 1 Not_Readmitted     0.0639 Not_Readmitted Not_Readmitted     0.354 Not_Readmit…
## 2 Not_Readmitted     0.116  Not_Readmitted Not_Readmitted     0.437 Not_Readmit…
## 3 Not_Readmitted     0.0874 Not_Readmitted Not_Readmitted     0.354 Not_Readmit…
## 4 Not_Readmitted     0.0777 Not_Readmitted Not_Readmitted     0.354 Not_Readmit…
## 5 Not_Readmitted     0.0777 Not_Readmitted Not_Readmitted     0.354 Not_Readmit…
## 6 Not_Readmitted     0.0847 Not_Readmitted Not_Readmitted     0.354 Not_Readmit…
## # ℹ 20 more variables: tree_pred_15 <fct>, rf_prob <dbl>, rf_pred_50 <fct>,
## #   rf_pred_15 <fct>, knn_prob <dbl>, knn_pred <fct>,
## #   predicted_risk_tier <fct>, age <fct>, discharge_group <fct>,
## #   time_in_hospital <dbl>, num_lab_procedures <dbl>, num_procedures <dbl>,
## #   num_medications <dbl>, number_outpatient <dbl>, number_emergency <dbl>,
## #   number_inpatient <dbl>, number_diagnoses <dbl>, prior_utilization <dbl>,
## #   insulin_change <fct>, medication_change <fct>
write_csv(
  final_predictions,
  "final_test_set_predictions.csv"
)

############################################################
# Additional Executive Summary Table
############################################################

prediction_summary <- final_predictions %>%
  summarise(
    
    total_patients = n(),
    
    actual_readmissions =
      sum(Readmit_30 == "Readmitted_30"),
    
    actual_readmission_rate =
      round(
        mean(Readmit_30 == "Readmitted_30") * 100,
        2
      ),
    
    avg_rf_probability =
      round(mean(rf_prob) * 100, 2),
    
    avg_logit_probability =
      round(mean(logit_prob) * 100, 2),
    
    avg_tree_probability =
      round(mean(tree_prob) * 100, 2),
    
    avg_knn_probability =
      round(mean(knn_prob) * 100, 2),
    
    rf_high_risk_flagged =
      sum(rf_pred_15 == "Readmitted_30"),
    
    logit_high_risk_flagged =
      sum(logit_pred_15 == "Readmitted_30"),
    
    tree_high_risk_flagged =
      sum(tree_pred_15 == "Readmitted_30"),
    
    knn_high_risk_flagged =
      sum(knn_pred == "Readmitted_30")
  )

prediction_summary
## # A tibble: 1 × 11
##   total_patients actual_readmissions actual_readmission_rate avg_rf_probability
##            <int>               <int>                   <dbl>              <dbl>
## 1          29131                3337                    11.5               11.8
## # ℹ 7 more variables: avg_logit_probability <dbl>, avg_tree_probability <dbl>,
## #   avg_knn_probability <dbl>, rf_high_risk_flagged <int>,
## #   logit_high_risk_flagged <int>, tree_high_risk_flagged <int>,
## #   knn_high_risk_flagged <int>
write_csv(
  prediction_summary,
  "prediction_summary.csv"
)

############################################################
# 16. Final Business Summary
############################################################

best_auc_model <- model_comparison %>%
  filter(!is.na(AUC)) %>%
  arrange(desc(AUC)) %>%
  slice(1)

best_recall_model <- model_comparison %>%
  arrange(desc(Sensitivity_Recall)) %>%
  slice(1)

best_balanced_model <- model_comparison %>%
  arrange(desc(Balanced_Accuracy)) %>%
  slice(1)

business_summary <- tibble(
  
  Item = c(
    "Baseline Accuracy",
    "Best AUC Model",
    "Best AUC",
    "Best Recall Model",
    "Best Recall",
    "Best Balanced Accuracy Model",
    "Best Balanced Accuracy",
    "Highest Risk Tier Readmission Rate",
    "Lowest Risk Tier Readmission Rate",
    "Operational Recommendation",
    "Primary Business Insight",
    "KNN Performance Insight"
  ),
  
  Value = c(
    
    # Baseline
    round(baseline_accuracy, 4),
    
    # AUC
    best_auc_model$Model,
    round(best_auc_model$AUC, 4),
    
    # Recall
    best_recall_model$Model,
    round(best_recall_model$Sensitivity_Recall, 4),
    
    # Balanced Accuracy
    best_balanced_model$Model,
    round(best_balanced_model$Balanced_Accuracy, 4),
    
    # Risk Tier Insights
    paste0(
      round(
        max(risk_tier_summary$actual_readmission_rate),
        2
      ),
      "%"
    ),
    
    paste0(
      round(
        min(risk_tier_summary$actual_readmission_rate),
        2
      ),
      "%"
    ),
    
    # Recommendations
    "Use lower probability thresholds such as 0.15 when hospitals prioritize identifying more high risk patients before discharge.",
    
    "Risk stratification successfully separated low and high readmission populations, with Very High Risk patients showing substantially elevated readmission rates.",
    
    "KNN produced weaker recall and balanced accuracy performance compared to Random Forest and Decision Tree models, limiting operational usefulness for intervention planning."
  )
)

business_summary
## # A tibble: 12 × 2
##    Item                               Value                                     
##    <chr>                              <chr>                                     
##  1 Baseline Accuracy                  0.8854                                    
##  2 Best AUC Model                     Logistic Regression - 0.50 Threshold      
##  3 Best AUC                           0.6604                                    
##  4 Best Recall Model                  Decision Tree - 0.15 Threshold            
##  5 Best Recall                        1                                         
##  6 Best Balanced Accuracy Model       Decision Tree - 0.50 Threshold            
##  7 Best Balanced Accuracy             0.6132                                    
##  8 Highest Risk Tier Readmission Rate 21.22%                                    
##  9 Lowest Risk Tier Readmission Rate  7.71%                                     
## 10 Operational Recommendation         Use lower probability thresholds such as …
## 11 Primary Business Insight           Risk stratification successfully separate…
## 12 KNN Performance Insight            KNN produced weaker recall and balanced a…
write_csv(
  business_summary,
  "business_summary_for_presentation.csv"
)

############################################################
# Executive Visualization
############################################################

ggplot(
  model_comparison %>%
    filter(Model != "Majority Class Baseline"),
  aes(
    x = reorder(Model, Balanced_Accuracy),
    y = Balanced_Accuracy,
    fill = Model
  )
) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  geom_text(
    aes(label = round(Balanced_Accuracy, 3)),
    hjust = -0.1
  ) +
  labs(
    title = "Balanced Accuracy Across Predictive Models",
    x = "Model",
    y = "Balanced Accuracy"
  ) +
  ylim(0, 1) +
  theme_minimal()

ggsave(
  "09_balanced_accuracy_comparison.png",
  width = 9,
  height = 6
)

############################################################
# High Risk Population Summary
############################################################

high_risk_summary <- risk_tier_summary %>%
  filter(predicted_risk_tier %in% c("High Risk", "Very High Risk")) %>%
  summarise(
    
    total_high_risk_patients =
      sum(encounters),
    
    percent_of_population =
      round(sum(percent_of_test_set), 2),
    
    combined_actual_readmissions =
      sum(actual_readmissions),
    
    combined_readmission_rate =
      round(
        weighted.mean(
          actual_readmission_rate,
          encounters
        ),
        2
      )
  )

high_risk_summary
## # A tibble: 1 × 4
##   total_high_risk_patients percent_of_population combined_actual_readmissions
##                      <int>                 <dbl>                        <int>
## 1                     8382                  28.8                         1497
## # ℹ 1 more variable: combined_readmission_rate <dbl>
write_csv(
  high_risk_summary,
  "high_risk_population_summary.csv"
)