The Whole R Script Running
############################################################
# 1. Load Data
############################################################
excel_file <- "diabetis_data.xlsx"
if (!file.exists(excel_file)) {
excel_file <- list.files(pattern = "diabet.*\\.xlsx$|diabetes.*\\.xlsx$", ignore.case = TRUE)[1]
}
diabetic_raw <- read_excel(excel_file)
head(diabetic_raw)
## # A tibble: 6 × 50
## encounter_id patient_nbr race gender age weight admission_type_id
## <dbl> <dbl> <chr> <chr> <chr> <chr> <dbl>
## 1 2278392 8222157 Caucasian Female [0-1… ? 6
## 2 149190 55629189 Caucasian Female [10-… ? 1
## 3 64410 86047875 AfricanAmerican Female [20-… ? 1
## 4 500364 82442376 Caucasian Male [30-… ? 1
## 5 16680 42519267 Caucasian Male [40-… ? 1
## 6 35754 82637451 Caucasian Male [50-… ? 2
## # ℹ 43 more variables: discharge_disposition_id <dbl>,
## # admission_source_id <dbl>, time_in_hospital <dbl>, payer_code <chr>,
## # medical_specialty <chr>, num_lab_procedures <dbl>, num_procedures <dbl>,
## # num_medications <dbl>, number_outpatient <dbl>, number_emergency <dbl>,
## # number_inpatient <dbl>, diag_1 <chr>, diag_2 <chr>, diag_3 <chr>,
## # number_diagnoses <dbl>, max_glu_serum <chr>, A1Cresult <chr>,
## # metformin <chr>, repaglinide <chr>, nateglinide <chr>, …
dim(diabetic_raw)
## [1] 101766 50
names(diabetic_raw)
## [1] "encounter_id" "patient_nbr"
## [3] "race" "gender"
## [5] "age" "weight"
## [7] "admission_type_id" "discharge_disposition_id"
## [9] "admission_source_id" "time_in_hospital"
## [11] "payer_code" "medical_specialty"
## [13] "num_lab_procedures" "num_procedures"
## [15] "num_medications" "number_outpatient"
## [17] "number_emergency" "number_inpatient"
## [19] "diag_1" "diag_2"
## [21] "diag_3" "number_diagnoses"
## [23] "max_glu_serum" "A1Cresult"
## [25] "metformin" "repaglinide"
## [27] "nateglinide" "chlorpropamide"
## [29] "glimepiride" "acetohexamide"
## [31] "glipizide" "glyburide"
## [33] "tolbutamide" "pioglitazone"
## [35] "rosiglitazone" "acarbose"
## [37] "miglitol" "troglitazone"
## [39] "tolazamide" "examide"
## [41] "citoglipton" "insulin"
## [43] "glyburide-metformin" "glipizide-metformin"
## [45] "glimepiride-pioglitazone" "metformin-rosiglitazone"
## [47] "metformin-pioglitazone" "change"
## [49] "diabetesMed" "readmitted"
str(diabetic_raw)
## tibble [101,766 × 50] (S3: tbl_df/tbl/data.frame)
## $ encounter_id : num [1:101766] 2278392 149190 64410 500364 16680 ...
## $ patient_nbr : num [1:101766] 8222157 55629189 86047875 82442376 42519267 ...
## $ race : chr [1:101766] "Caucasian" "Caucasian" "AfricanAmerican" "Caucasian" ...
## $ gender : chr [1:101766] "Female" "Female" "Female" "Male" ...
## $ age : chr [1:101766] "[0-10)" "[10-20)" "[20-30)" "[30-40)" ...
## $ weight : chr [1:101766] "?" "?" "?" "?" ...
## $ admission_type_id : num [1:101766] 6 1 1 1 1 2 3 1 2 3 ...
## $ discharge_disposition_id: num [1:101766] 25 1 1 1 1 1 1 1 1 3 ...
## $ admission_source_id : num [1:101766] 1 7 7 7 7 2 2 7 4 4 ...
## $ time_in_hospital : num [1:101766] 1 3 2 2 1 3 4 5 13 12 ...
## $ payer_code : chr [1:101766] "?" "?" "?" "?" ...
## $ medical_specialty : chr [1:101766] "Pediatrics-Endocrinology" "?" "?" "?" ...
## $ num_lab_procedures : num [1:101766] 41 59 11 44 51 31 70 73 68 33 ...
## $ num_procedures : num [1:101766] 0 0 5 1 0 6 1 0 2 3 ...
## $ num_medications : num [1:101766] 1 18 13 16 8 16 21 12 28 18 ...
## $ number_outpatient : num [1:101766] 0 0 2 0 0 0 0 0 0 0 ...
## $ number_emergency : num [1:101766] 0 0 0 0 0 0 0 0 0 0 ...
## $ number_inpatient : num [1:101766] 0 0 1 0 0 0 0 0 0 0 ...
## $ diag_1 : chr [1:101766] "250.83" "276" "648" "8" ...
## $ diag_2 : chr [1:101766] "?" "250.01" "250" "250.43" ...
## $ diag_3 : chr [1:101766] "?" "255" "V27" "403" ...
## $ number_diagnoses : num [1:101766] 1 9 6 7 5 9 7 8 8 8 ...
## $ max_glu_serum : chr [1:101766] "None" "None" "None" "None" ...
## $ A1Cresult : chr [1:101766] "None" "None" "None" "None" ...
## $ metformin : chr [1:101766] "No" "No" "No" "No" ...
## $ repaglinide : chr [1:101766] "No" "No" "No" "No" ...
## $ nateglinide : chr [1:101766] "No" "No" "No" "No" ...
## $ chlorpropamide : chr [1:101766] "No" "No" "No" "No" ...
## $ glimepiride : chr [1:101766] "No" "No" "No" "No" ...
## $ acetohexamide : chr [1:101766] "No" "No" "No" "No" ...
## $ glipizide : chr [1:101766] "No" "No" "Steady" "No" ...
## $ glyburide : chr [1:101766] "No" "No" "No" "No" ...
## $ tolbutamide : chr [1:101766] "No" "No" "No" "No" ...
## $ pioglitazone : chr [1:101766] "No" "No" "No" "No" ...
## $ rosiglitazone : chr [1:101766] "No" "No" "No" "No" ...
## $ acarbose : chr [1:101766] "No" "No" "No" "No" ...
## $ miglitol : chr [1:101766] "No" "No" "No" "No" ...
## $ troglitazone : chr [1:101766] "No" "No" "No" "No" ...
## $ tolazamide : chr [1:101766] "No" "No" "No" "No" ...
## $ examide : chr [1:101766] "No" "No" "No" "No" ...
## $ citoglipton : chr [1:101766] "No" "No" "No" "No" ...
## $ insulin : chr [1:101766] "No" "Up" "No" "Up" ...
## $ glyburide-metformin : chr [1:101766] "No" "No" "No" "No" ...
## $ glipizide-metformin : chr [1:101766] "No" "No" "No" "No" ...
## $ glimepiride-pioglitazone: chr [1:101766] "No" "No" "No" "No" ...
## $ metformin-rosiglitazone : chr [1:101766] "No" "No" "No" "No" ...
## $ metformin-pioglitazone : chr [1:101766] "No" "No" "No" "No" ...
## $ change : chr [1:101766] "No" "Ch" "No" "Ch" ...
## $ diabetesMed : chr [1:101766] "No" "Yes" "Yes" "Yes" ...
## $ readmitted : chr [1:101766] "NO" ">30" "NO" "NO" ...
############################################################
# 2. Lookup Tables
############################################################
admission_type <- tribble(
~admission_type_id, ~admission_type_desc,
1, "Emergency",
2, "Urgent",
3, "Elective",
4, "Newborn",
5, "Not Available",
6, "NULL",
7, "Trauma Center",
8, "Not Mapped"
)
discharge_disposition <- tribble(
~discharge_disposition_id, ~discharge_disposition_desc,
1, "Discharged to home",
2, "Discharged/transferred to another short term hospital",
3, "Discharged/transferred to SNF",
4, "Discharged/transferred to ICF",
5, "Discharged/transferred to another type of inpatient care institution",
6, "Discharged/transferred to home with home health service",
7, "Left AMA",
8, "Discharged/transferred to home under care of Home IV provider",
9, "Admitted as an inpatient to this hospital",
10, "Neonate discharged to another hospital for neonatal aftercare",
11, "Expired",
12, "Still patient or expected to return for outpatient services",
13, "Hospice / home",
14, "Hospice / medical facility",
15, "Discharged/transferred within this institution to Medicare approved swing bed",
16, "Discharged/transferred/referred another institution for outpatient services",
17, "Discharged/transferred/referred to this institution for outpatient services",
18, "NULL",
19, "Expired at home. Medicaid only, hospice.",
20, "Expired in a medical facility. Medicaid only, hospice.",
21, "Expired, place unknown. Medicaid only, hospice.",
22, "Discharged/transferred to another rehab fac including rehab units of a hospital",
23, "Discharged/transferred to a long term care hospital",
24, "Discharged/transferred to a nursing facility certified under Medicaid but not certified under Medicare",
25, "Not Mapped",
26, "Unknown/Invalid",
27, "Discharged/transferred to a federal health care facility",
28, "Discharged/transferred/referred to a psychiatric hospital of psychiatric distinct part unit of a hospital",
29, "Discharged/transferred to a Critical Access Hospital (CAH)",
30, "Discharged/transferred to another Type of Health Care Institution not Defined Elsewhere"
)
admission_source <- tribble(
~admission_source_id, ~admission_source_desc,
1, "Physician Referral",
2, "Clinic Referral",
3, "HMO Referral",
4, "Transfer from a hospital",
5, "Transfer from a Skilled Nursing Facility (SNF)",
6, "Transfer from another health care facility",
7, "Emergency Room",
8, "Court/Law Enforcement",
9, "Not Available",
10, "Transfer from critical access hospital",
11, "Normal Delivery",
12, "Premature Delivery",
13, "Sick Baby",
14, "Extramural Birth",
15, "Not Available",
17, "NULL",
18, "Transfer From Another Home Health Agency",
19, "Readmission to Same Home Health Agency",
20, "Not Mapped",
21, "Unknown/Invalid",
22, "Transfer from hospital inpatient/same facility result in a separate claim",
23, "Born inside this hospital",
24, "Born outside this hospital",
25, "Transfer from Ambulatory Surgery Center",
26, "Transfer from Hospice"
)
############################################################
# 3. Data Preparation
############################################################
diabetic <- diabetic_raw %>%
mutate(across(where(is.character), ~ na_if(.x, "?"))) %>%
left_join(admission_type, by = "admission_type_id") %>%
left_join(discharge_disposition, by = "discharge_disposition_id") %>%
left_join(admission_source, by = "admission_source_id") %>%
mutate(
Readmit_30 = ifelse(readmitted == "<30", "Readmitted_30", "Not_Readmitted"),
Readmit_30 = factor(Readmit_30, levels = c("Not_Readmitted", "Readmitted_30")),
readmitted_30_flag = ifelse(readmitted == "<30", 1, 0),
admission_group = case_when(
admission_type_desc %in% c("Emergency", "Urgent") ~ "Emergency_Urgent",
admission_type_desc == "Elective" ~ "Elective",
TRUE ~ "Other"
),
discharge_group = case_when(
discharge_disposition_id == 1 ~ "Home",
discharge_disposition_id == 3 ~ "SNF",
discharge_disposition_id == 6 ~ "Home_Health",
discharge_disposition_id == 22 ~ "Rehab",
discharge_disposition_id %in% c(11, 13, 14, 19, 20, 21) ~ "Expired_Hospice",
TRUE ~ "Other"
),
prior_utilization = number_outpatient + number_emergency + number_inpatient,
prior_hospital_use = ifelse(number_inpatient > 0 | number_emergency > 0, "Yes", "No"),
insulin_change = ifelse(insulin %in% c("Up", "Down"), "Changed", "Not_Changed"),
medication_change = ifelse(change == "Ch", "Changed", "Not_Changed"),
high_diagnosis_count = ifelse(number_diagnoses >= median(number_diagnoses, na.rm = TRUE), "High", "Low"),
high_med_count = ifelse(num_medications >= median(num_medications, na.rm = TRUE), "High", "Low"),
age = as.factor(age),
race = as.factor(race),
gender = as.factor(gender),
admission_group = as.factor(admission_group),
discharge_group = as.factor(discharge_group),
admission_source_desc = as.factor(admission_source_desc),
A1Cresult = as.factor(A1Cresult),
insulin = as.factor(insulin),
diabetesMed = as.factor(diabetesMed),
medication_change = as.factor(medication_change),
insulin_change = as.factor(insulin_change),
prior_hospital_use = as.factor(prior_hospital_use),
high_diagnosis_count = as.factor(high_diagnosis_count),
high_med_count = as.factor(high_med_count)
)
############################################################
# 4. Target Review and Basic EDA
############################################################
table(diabetic$Readmit_30)
##
## Not_Readmitted Readmitted_30
## 90409 11357
prop.table(table(diabetic$Readmit_30))
##
## Not_Readmitted Readmitted_30
## 0.8884008 0.1115992
readmission_summary <- diabetic %>%
count(Readmit_30) %>%
mutate(percent = round(n / sum(n) * 100, 2))
readmission_summary
## # A tibble: 2 × 3
## Readmit_30 n percent
## <fct> <int> <dbl>
## 1 Not_Readmitted 90409 88.8
## 2 Readmitted_30 11357 11.2
write_csv(readmission_summary, "readmission_target_summary.csv")
ggplot(diabetic, aes(x = Readmit_30)) +
geom_bar(fill = "steelblue") +
labs(
title = "30-Day Readmission Target Distribution",
x = "Readmission Status",
y = "Number of Encounters"
) +
theme_minimal()

ggsave("01_target_distribution.png", width = 7, height = 5)
eda_summary <- diabetic %>%
group_by(Readmit_30) %>%
summarise(
n = n(),
avg_time_in_hospital = round(mean(time_in_hospital, na.rm = TRUE), 2),
avg_lab_procedures = round(mean(num_lab_procedures, na.rm = TRUE), 2),
avg_medications = round(mean(num_medications, na.rm = TRUE), 2),
avg_diagnoses = round(mean(number_diagnoses, na.rm = TRUE), 2),
avg_outpatient = round(mean(number_outpatient, na.rm = TRUE), 2),
avg_emergency = round(mean(number_emergency, na.rm = TRUE), 2),
avg_inpatient = round(mean(number_inpatient, na.rm = TRUE), 2),
.groups = "drop"
)
eda_summary
## # A tibble: 2 × 9
## Readmit_30 n avg_time_in_hospital avg_lab_procedures avg_medications
## <fct> <int> <dbl> <dbl> <dbl>
## 1 Not_Readmitted 90409 4.35 43.0 15.9
## 2 Readmitted_30 11357 4.77 44.2 16.9
## # ℹ 4 more variables: avg_diagnoses <dbl>, avg_outpatient <dbl>,
## # avg_emergency <dbl>, avg_inpatient <dbl>
write_csv(eda_summary, "eda_summary_by_readmission.csv")
ggplot(diabetic, aes(x = Readmit_30, y = number_inpatient)) +
geom_boxplot() +
labs(
title = "Prior Inpatient Visits by Readmission Status",
x = "Readmission Status",
y = "Prior Inpatient Visits"
) +
theme_minimal()

ggsave("02_prior_inpatient_by_readmission.png", width = 7, height = 5)
ggplot(diabetic, aes(x = insulin_change, fill = Readmit_30)) +
geom_bar(position = "fill") +
labs(
title = "Readmission Status by Insulin Change",
x = "Insulin Change",
y = "Proportion",
fill = "Readmission Status"
) +
theme_minimal()

ggsave("03_insulin_change_readmission.png", width = 7, height = 5)
############################################################
# 5. Model-Ready Dataset
############################################################
model_data <- diabetic %>%
filter(discharge_group != "Expired_Hospice") %>%
select(
Readmit_30,
age,
race,
admission_group,
discharge_group,
admission_source_desc,
time_in_hospital,
num_lab_procedures,
num_procedures,
num_medications,
number_outpatient,
number_emergency,
number_inpatient,
prior_utilization,
prior_hospital_use,
number_diagnoses,
A1Cresult,
insulin,
insulin_change,
medication_change,
diabetesMed,
high_diagnosis_count,
high_med_count
) %>%
drop_na()
head(model_data)
## # A tibble: 6 × 23
## Readmit_30 age race admission_group discharge_group admission_source_desc
## <fct> <fct> <fct> <fct> <fct> <fct>
## 1 Not_Readmit… [0-1… Cauc… Other Other Physician Referral
## 2 Not_Readmit… [10-… Cauc… Emergency_Urge… Home Emergency Room
## 3 Not_Readmit… [20-… Afri… Emergency_Urge… Home Emergency Room
## 4 Not_Readmit… [30-… Cauc… Emergency_Urge… Home Emergency Room
## 5 Not_Readmit… [40-… Cauc… Emergency_Urge… Home Emergency Room
## 6 Not_Readmit… [50-… Cauc… Emergency_Urge… Home Clinic Referral
## # ℹ 17 more variables: time_in_hospital <dbl>, num_lab_procedures <dbl>,
## # num_procedures <dbl>, num_medications <dbl>, number_outpatient <dbl>,
## # number_emergency <dbl>, number_inpatient <dbl>, prior_utilization <dbl>,
## # prior_hospital_use <fct>, number_diagnoses <dbl>, A1Cresult <fct>,
## # insulin <fct>, insulin_change <fct>, medication_change <fct>,
## # diabetesMed <fct>, high_diagnosis_count <fct>, high_med_count <fct>
dim(model_data)
## [1] 97109 23
table(model_data$Readmit_30)
##
## Not_Readmitted Readmitted_30
## 85983 11126
prop.table(table(model_data$Readmit_30))
##
## Not_Readmitted Readmitted_30
## 0.8854277 0.1145723
write_csv(model_data, "diabetes_model_ready.csv")
############################################################
# 6. Train/Test Split
############################################################
model_data$Readmit_30 <- factor(
as.character(model_data$Readmit_30),
levels = c("Not_Readmitted", "Readmitted_30")
)
model_data <- model_data %>%
mutate(
across(
.cols = c(
age,
race,
admission_group,
discharge_group,
A1Cresult,
insulin,
insulin_change,
medication_change,
diabetesMed,
high_diagnosis_count,
high_med_count
),
.fns = ~ fct_lump_min(., min = 20, other_level = "Other")
)
)
model_data$Readmit_30 <- droplevels(model_data$Readmit_30)
train_index <- createDataPartition(
model_data$Readmit_30,
p = 0.70,
list = FALSE
)
train_data <- model_data[train_index, ]
test_data <- model_data[-train_index, ]
factor_vars <- setdiff(
names(train_data)[sapply(train_data, is.factor)],
"Readmit_30"
)
for (v in factor_vars) {
train_data[[v]] <- fct_expand(train_data[[v]], "Other")
test_data[[v]] <- fct_other(
test_data[[v]],
keep = levels(train_data[[v]])[
levels(train_data[[v]]) != "Other"
],
other_level = "Other"
)
test_data[[v]] <- factor(
test_data[[v]],
levels = levels(train_data[[v]])
)
}
prop.table(table(train_data$Readmit_30))
##
## Not_Readmitted Readmitted_30
## 0.8854188 0.1145812
prop.table(table(test_data$Readmit_30))
##
## Not_Readmitted Readmitted_30
## 0.8854485 0.1145515
############################################################
# 7. Baseline Accuracy
############################################################
majority_class <- names(which.max(table(train_data$Readmit_30)))
baseline_accuracy <- mean(test_data$Readmit_30 == majority_class)
baseline_accuracy
## [1] 0.8854485
############################################################
# 8. Logistic Regression Model
############################################################
logit_model <- glm(
Readmit_30 ~
age + race + admission_group + discharge_group +
time_in_hospital + num_lab_procedures + num_procedures +
num_medications + number_outpatient + number_emergency +
number_inpatient + prior_utilization + prior_hospital_use +
number_diagnoses + A1Cresult + insulin + insulin_change +
medication_change + diabetesMed + high_diagnosis_count + high_med_count,
data = train_data,
family = binomial
)
summary(logit_model)
##
## Call:
## glm(formula = Readmit_30 ~ age + race + admission_group + discharge_group +
## time_in_hospital + num_lab_procedures + num_procedures +
## num_medications + number_outpatient + number_emergency +
## number_inpatient + prior_utilization + prior_hospital_use +
## number_diagnoses + A1Cresult + insulin + insulin_change +
## medication_change + diabetesMed + high_diagnosis_count +
## high_med_count, family = binomial, data = train_data)
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.5219144 0.7279291 -6.212 5.23e-10 ***
## age[10-20) 0.9573203 0.7391770 1.295 0.195280
## age[20-30) 1.5559482 0.7207939 2.159 0.030877 *
## age[30-40) 1.3930218 0.7181915 1.940 0.052425 .
## age[40-50) 1.3472954 0.7167209 1.880 0.060135 .
## age[50-60) 1.2519629 0.7164167 1.748 0.080545 .
## age[60-70) 1.3578277 0.7162935 1.896 0.058009 .
## age[70-80) 1.3949600 0.7162728 1.948 0.051472 .
## age[80-90) 1.3800691 0.7165438 1.926 0.054103 .
## age[90-100) 1.3162757 0.7198196 1.829 0.067457 .
## raceAsian 0.2592958 0.1492476 1.737 0.082325 .
## raceCaucasian -0.0039834 0.0321969 -0.124 0.901536
## raceHispanic -0.0305772 0.0926311 -0.330 0.741327
## raceOther -0.1235822 0.1111361 -1.112 0.266143
## admission_groupEmergency_Urgent 0.0580347 0.0351588 1.651 0.098811 .
## admission_groupOther 0.0264541 0.0499516 0.530 0.596394
## discharge_groupHome_Health 0.1867381 0.0383211 4.873 1.10e-06 ***
## discharge_groupRehab 1.3345045 0.0644987 20.690 < 2e-16 ***
## discharge_groupSNF 0.3317891 0.0380310 8.724 < 2e-16 ***
## discharge_groupOther 0.4527593 0.0395776 11.440 < 2e-16 ***
## time_in_hospital 0.0019630 0.0048158 0.408 0.683555
## num_lab_procedures 0.0018166 0.0007193 2.526 0.011550 *
## num_procedures -0.0125493 0.0084029 -1.493 0.135319
## num_medications 0.0017449 0.0025130 0.694 0.487458
## number_outpatient -0.0109355 0.0093223 -1.173 0.240776
## number_emergency 0.0218240 0.0103247 2.114 0.034534 *
## number_inpatient 0.2230864 0.0096190 23.192 < 2e-16 ***
## prior_utilization NA NA NA NA
## prior_hospital_useYes 0.2127560 0.0315337 6.747 1.51e-11 ***
## number_diagnoses 0.0580651 0.0148632 3.907 9.36e-05 ***
## A1Cresult>8 0.0755534 0.0810859 0.932 0.351456
## A1CresultNone 0.1359545 0.0688742 1.974 0.048387 *
## A1CresultNorm -0.0017525 0.0892279 -0.020 0.984330
## insulinNo -0.2923995 0.0483304 -6.050 1.45e-09 ***
## insulinSteady -0.2216333 0.0440672 -5.029 4.92e-07 ***
## insulinUp -0.1269756 0.0474863 -2.674 0.007497 **
## insulin_changeNot_Changed NA NA NA NA
## medication_changeNot_Changed 0.1391060 0.0350277 3.971 7.15e-05 ***
## diabetesMedYes 0.1504966 0.0398320 3.778 0.000158 ***
## high_diagnosis_countLow 0.0634175 0.0547461 1.158 0.246703
## high_med_countLow -0.0681849 0.0362036 -1.883 0.059650 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 48399 on 67977 degrees of freedom
## Residual deviance: 46125 on 67939 degrees of freedom
## AIC: 46203
##
## Number of Fisher Scoring iterations: 6
test_data <- test_data %>%
mutate(
logit_prob = predict(logit_model, newdata = test_data, type = "response"),
logit_pred_50 = ifelse(logit_prob >= 0.50, "Readmitted_30", "Not_Readmitted"),
logit_pred_50 = factor(logit_pred_50, levels = levels(Readmit_30)),
logit_pred_15 = ifelse(logit_prob >= 0.15, "Readmitted_30", "Not_Readmitted"),
logit_pred_15 = factor(logit_pred_15, levels = levels(Readmit_30))
)
logit_accuracy_50 <- mean(test_data$logit_pred_50 == test_data$Readmit_30)
logit_accuracy_15 <- mean(test_data$logit_pred_15 == test_data$Readmit_30)
logit_accuracy_50
## [1] 0.8853798
logit_accuracy_15
## [1] 0.7843191
confusionMatrix(
data = test_data$logit_pred_50,
reference = test_data$Readmit_30,
positive = "Readmitted_30"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Not_Readmitted Readmitted_30
## Not_Readmitted 25752 3297
## Readmitted_30 42 40
##
## Accuracy : 0.8854
## 95% CI : (0.8817, 0.889)
## No Information Rate : 0.8854
## P-Value [Acc > NIR] : 0.5193
##
## Kappa : 0.018
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.011987
## Specificity : 0.998372
## Pos Pred Value : 0.487805
## Neg Pred Value : 0.886502
## Prevalence : 0.114552
## Detection Rate : 0.001373
## Detection Prevalence : 0.002815
## Balanced Accuracy : 0.505179
##
## 'Positive' Class : Readmitted_30
##
confusionMatrix(
data = test_data$logit_pred_15,
reference = test_data$Readmit_30,
positive = "Readmitted_30"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Not_Readmitted Readmitted_30
## Not_Readmitted 21672 2161
## Readmitted_30 4122 1176
##
## Accuracy : 0.7843
## 95% CI : (0.7796, 0.789)
## No Information Rate : 0.8854
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1534
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.35241
## Specificity : 0.84020
## Pos Pred Value : 0.22197
## Neg Pred Value : 0.90933
## Prevalence : 0.11455
## Detection Rate : 0.04037
## Detection Prevalence : 0.18187
## Balanced Accuracy : 0.59630
##
## 'Positive' Class : Readmitted_30
##
logit_roc <- roc(
response = test_data$Readmit_30,
predictor = test_data$logit_prob,
levels = c("Not_Readmitted", "Readmitted_30")
)
## Setting direction: controls < cases
logit_auc <- auc(logit_roc)
logit_auc
## Area under the curve: 0.6604
############################################################
# 9. Decision Tree Model
############################################################
tree_model <- rpart(
Readmit_30 ~
age + race + admission_group + discharge_group +
time_in_hospital + num_lab_procedures + num_procedures +
num_medications + number_outpatient + number_emergency +
number_inpatient + prior_hospital_use +
number_diagnoses + A1Cresult + insulin +
medication_change + diabetesMed + high_diagnosis_count + high_med_count,
data = train_data,
method = "class",
parms = list(prior = c(0.50, 0.50)),
control = rpart.control(cp = 0.0001, minsplit = 50, maxdepth = 5)
)
print(tree_model)
## n= 67978
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 67978 33989.0000 Readmitted_30 (0.5000000 0.5000000)
## 2) number_inpatient< 1.5 58199 24655.0100 Not_Readmitted (0.5461962 0.4538038)
## 4) discharge_group=Home 36185 12196.5900 Not_Readmitted (0.6072214 0.3927786)
## 8) prior_hospital_use=No 26831 7758.6910 Not_Readmitted (0.6458227 0.3541773) *
## 9) prior_hospital_use=Yes 9354 4437.9010 Not_Readmitted (0.5147630 0.4852370)
## 18) number_diagnoses< 7.5 3842 1531.6650 Not_Readmitted (0.5627623 0.4372377) *
## 19) number_diagnoses>=7.5 5512 2736.5580 Readmitted_30 (0.4849651 0.5150349)
## 38) num_medications< 17.5 3375 1623.3030 Not_Readmitted (0.5109223 0.4890777) *
## 39) num_medications>=17.5 2137 1040.7500 Readmitted_30 (0.4478882 0.5521118) *
## 5) discharge_group=Home_Health,Rehab,SNF,Other 22014 10819.1700 Readmitted_30 (0.4647893 0.5352107)
## 10) discharge_group=Home_Health,SNF,Other 20814 10325.6200 Readmitted_30 (0.4833761 0.5166239)
## 20) prior_hospital_use=No 14189 6859.7650 Not_Readmitted (0.5094787 0.4905213) *
## 21) prior_hospital_use=Yes 6625 3200.7450 Readmitted_30 (0.4338921 0.5661079) *
## 11) discharge_group=Rehab 1200 493.5517 Readmitted_30 (0.2575782 0.7424218) *
## 3) number_inpatient>=1.5 9779 4314.3420 Readmitted_30 (0.3161076 0.6838924) *
printcp(tree_model)
##
## Classification tree:
## rpart(formula = Readmit_30 ~ age + race + admission_group + discharge_group +
## time_in_hospital + num_lab_procedures + num_procedures +
## num_medications + number_outpatient + number_emergency +
## number_inpatient + prior_hospital_use + number_diagnoses +
## A1Cresult + insulin + medication_change + diabetesMed + high_diagnosis_count +
## high_med_count, data = train_data, method = "class", parms = list(prior = c(0.5,
## 0.5)), control = rpart.control(cp = 1e-04, minsplit = 50,
## maxdepth = 5))
##
## Variables actually used in tree construction:
## [1] discharge_group num_medications number_diagnoses number_inpatient
## [5] prior_hospital_use
##
## Root node error: 33989/67978 = 0.5
##
## n= 67978
##
## CP nsplit rel error xerror xstd
## 1 0.1476846 0 1.00000 1.01659 0.0069309
## 2 0.0482286 1 0.85232 0.85232 0.0091952
## 3 0.0039000 2 0.80409 0.80541 0.0069028
## 4 0.0024961 4 0.79629 0.80438 0.0071865
## 5 0.0021332 6 0.79129 0.79705 0.0074642
## 6 0.0001000 7 0.78916 0.79663 0.0073978
png("04_decision_tree.png", width = 1200, height = 800)
plot(tree_model, uniform = TRUE, margin = 0.1)
text(tree_model, use.n = TRUE, cex = 0.7)
dev.off()
## quartz_off_screen
## 2
tree_prob_matrix <- predict(tree_model, newdata = test_data, type = "prob")
tree_prob <- as.numeric(tree_prob_matrix[, "Readmitted_30"])
test_data <- test_data %>%
mutate(
tree_prob = tree_prob,
tree_pred_50 = ifelse(tree_prob >= 0.50, "Readmitted_30", "Not_Readmitted"),
tree_pred_50 = factor(tree_pred_50, levels = levels(Readmit_30)),
tree_pred_15 = ifelse(tree_prob >= 0.15, "Readmitted_30", "Not_Readmitted"),
tree_pred_15 = factor(tree_pred_15, levels = levels(Readmit_30))
)
tree_accuracy_50 <- mean(test_data$tree_pred_50 == test_data$Readmit_30)
tree_accuracy_15 <- mean(test_data$tree_pred_15 == test_data$Readmit_30)
tree_accuracy_50
## [1] 0.7059147
tree_accuracy_15
## [1] 0.1145515
confusionMatrix(
data = test_data$tree_pred_50,
reference = test_data$Readmit_30,
positive = "Readmitted_30"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Not_Readmitted Readmitted_30
## Not_Readmitted 18919 1692
## Readmitted_30 6875 1645
##
## Accuracy : 0.7059
## 95% CI : (0.7006, 0.7111)
## No Information Rate : 0.8854
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1351
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.49296
## Specificity : 0.73347
## Pos Pred Value : 0.19308
## Neg Pred Value : 0.91791
## Prevalence : 0.11455
## Detection Rate : 0.05647
## Detection Prevalence : 0.29247
## Balanced Accuracy : 0.61321
##
## 'Positive' Class : Readmitted_30
##
confusionMatrix(
data = test_data$tree_pred_15,
reference = test_data$Readmit_30,
positive = "Readmitted_30"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Not_Readmitted Readmitted_30
## Not_Readmitted 0 0
## Readmitted_30 25794 3337
##
## Accuracy : 0.1146
## 95% CI : (0.1109, 0.1183)
## No Information Rate : 0.8854
## P-Value [Acc > NIR] : 1
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.1146
## Neg Pred Value : NaN
## Prevalence : 0.1146
## Detection Rate : 0.1146
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Readmitted_30
##
tree_roc <- roc(
response = test_data$Readmit_30,
predictor = test_data$tree_prob,
levels = c("Not_Readmitted", "Readmitted_30")
)
## Setting direction: controls < cases
tree_auc <- auc(tree_roc)
tree_auc
## Area under the curve: 0.6494
############################################################
# 10. Random Forest Model
############################################################
rf_train_data <- train_data
if (nrow(rf_train_data) > 25000) {
rf_index <- createDataPartition(
rf_train_data$Readmit_30,
p = 25000 / nrow(rf_train_data),
list = FALSE
)
rf_train_data <- rf_train_data[rf_index, ]
}
rf_model <- randomForest(
Readmit_30 ~
age + race + admission_group + discharge_group +
time_in_hospital + num_lab_procedures + num_procedures +
num_medications + number_outpatient + number_emergency +
number_inpatient + prior_utilization + prior_hospital_use +
number_diagnoses + A1Cresult + insulin + insulin_change +
medication_change + diabetesMed + high_diagnosis_count + high_med_count,
data = rf_train_data,
ntree = 300,
importance = TRUE
)
rf_model
##
## Call:
## randomForest(formula = Readmit_30 ~ age + race + admission_group + discharge_group + time_in_hospital + num_lab_procedures + num_procedures + num_medications + number_outpatient + number_emergency + number_inpatient + prior_utilization + prior_hospital_use + number_diagnoses + A1Cresult + insulin + insulin_change + medication_change + diabetesMed + high_diagnosis_count + high_med_count, data = rf_train_data, ntree = 300, importance = TRUE)
## Type of random forest: classification
## Number of trees: 300
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 11.49%
## Confusion matrix:
## Not_Readmitted Readmitted_30 class.error
## Not_Readmitted 22086 50 0.002258764
## Readmitted_30 2823 42 0.985340314
rf_prob_matrix <- predict(rf_model, newdata = test_data, type = "prob")
rf_prob <- as.numeric(rf_prob_matrix[, "Readmitted_30"])
test_data <- test_data %>%
mutate(
rf_prob = rf_prob,
rf_pred_50 = ifelse(rf_prob >= 0.50, "Readmitted_30", "Not_Readmitted"),
rf_pred_50 = factor(rf_pred_50, levels = levels(Readmit_30)),
rf_pred_15 = ifelse(rf_prob >= 0.15, "Readmitted_30", "Not_Readmitted"),
rf_pred_15 = factor(rf_pred_15, levels = levels(Readmit_30))
)
rf_accuracy_50 <- mean(test_data$rf_pred_50 == test_data$Readmit_30)
rf_accuracy_15 <- mean(test_data$rf_pred_15 == test_data$Readmit_30)
rf_accuracy_50
## [1] 0.8858261
rf_accuracy_15
## [1] 0.7004909
confusionMatrix(
data = test_data$rf_pred_50,
reference = test_data$Readmit_30,
positive = "Readmitted_30"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Not_Readmitted Readmitted_30
## Not_Readmitted 25755 3287
## Readmitted_30 39 50
##
## Accuracy : 0.8858
## 95% CI : (0.8821, 0.8895)
## No Information Rate : 0.8854
## P-Value [Acc > NIR] : 0.4243
##
## Kappa : 0.0234
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.014984
## Specificity : 0.998488
## Pos Pred Value : 0.561798
## Neg Pred Value : 0.886819
## Prevalence : 0.114552
## Detection Rate : 0.001716
## Detection Prevalence : 0.003055
## Balanced Accuracy : 0.506736
##
## 'Positive' Class : Readmitted_30
##
confusionMatrix(
data = test_data$rf_pred_15,
reference = test_data$Readmit_30,
positive = "Readmitted_30"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Not_Readmitted Readmitted_30
## Not_Readmitted 18909 1840
## Readmitted_30 6885 1497
##
## Accuracy : 0.7005
## 95% CI : (0.6952, 0.7057)
## No Information Rate : 0.8854
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1096
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.44861
## Specificity : 0.73308
## Pos Pred Value : 0.17860
## Neg Pred Value : 0.91132
## Prevalence : 0.11455
## Detection Rate : 0.05139
## Detection Prevalence : 0.28773
## Balanced Accuracy : 0.59084
##
## 'Positive' Class : Readmitted_30
##
rf_roc <- roc(
response = test_data$Readmit_30,
predictor = test_data$rf_prob,
levels = c("Not_Readmitted", "Readmitted_30")
)
## Setting direction: controls < cases
rf_auc <- auc(rf_roc)
rf_auc
## Area under the curve: 0.6332
importance_table <- as.data.frame(importance(rf_model)) %>%
rownames_to_column("Variable") %>%
arrange(desc(MeanDecreaseGini))
importance_table
## Variable Not_Readmitted Readmitted_30 MeanDecreaseAccuracy
## 1 num_lab_procedures 23.285749 -7.0026099 19.804641
## 2 num_medications 28.244949 -10.8445947 24.209542
## 3 time_in_hospital 27.022479 -7.0203465 24.892091
## 4 age 17.456273 -5.5285031 14.850999
## 5 num_procedures 17.627735 -4.3878906 15.147771
## 6 discharge_group 25.691344 10.5893495 28.604706
## 7 number_diagnoses 19.558166 -5.7847059 17.374685
## 8 prior_utilization 10.354183 8.4006557 13.519617
## 9 insulin 7.864688 -1.1703806 7.323258
## 10 number_inpatient 5.874477 17.8806779 10.472016
## 11 race 5.660397 -1.9601419 4.677275
## 12 admission_group 17.155422 -5.2218277 14.743514
## 13 A1Cresult 4.338330 2.2907170 4.976087
## 14 number_outpatient 17.826535 -4.9358317 16.584679
## 15 number_emergency 9.496486 -2.3185172 8.735020
## 16 medication_change 8.737793 -3.3757848 7.351964
## 17 diabetesMed 2.198277 -0.5577732 1.922025
## 18 high_med_count 15.889228 -7.5725669 14.927081
## 19 high_diagnosis_count 13.508832 -7.6961569 12.024778
## 20 insulin_change 2.982510 0.8371765 3.107998
## 21 prior_hospital_use 2.704876 9.0583866 6.058677
## MeanDecreaseGini
## 1 715.74316
## 2 575.54567
## 3 425.95849
## 4 404.80839
## 5 296.56759
## 6 254.38006
## 7 228.86677
## 8 188.87266
## 9 187.35124
## 10 171.52202
## 11 151.91194
## 12 141.11304
## 13 131.59531
## 14 116.85064
## 15 87.11908
## 16 86.44838
## 17 64.80766
## 18 60.63315
## 19 49.36050
## 20 47.79434
## 21 34.55936
write_csv(importance_table, "random_forest_variable_importance.csv")
ggplot(importance_table %>% slice_max(MeanDecreaseGini, n = 12),
aes(x = reorder(Variable, MeanDecreaseGini), y = MeanDecreaseGini)) +
geom_col(fill = "darkgreen") +
coord_flip() +
labs(
title = "Top Random Forest Predictors",
x = "Predictor",
y = "Importance"
) +
theme_minimal()

ggsave("05_random_forest_variable_importance.png", width = 8, height = 6)
############################################################
# 11. kNN Model
############################################################
knn_data <- model_data %>%
select(
Readmit_30,
time_in_hospital,
num_lab_procedures,
num_procedures,
num_medications,
number_outpatient,
number_emergency,
number_inpatient,
prior_utilization,
number_diagnoses
)
knn_train <- knn_data[train_index, ]
knn_test <- knn_data[-train_index, ]
train_control <- trainControl(
method = "cv",
number = 5,
classProbs = TRUE
)
knn_grid <- expand.grid(k = seq(3, 21, by = 2))
knn_model <- train(
Readmit_30 ~ .,
data = knn_train,
method = "knn",
trControl = train_control,
preProcess = c("center", "scale"),
tuneGrid = knn_grid
)
knn_model
## k-Nearest Neighbors
##
## 67978 samples
## 9 predictor
## 2 classes: 'Not_Readmitted', 'Readmitted_30'
##
## Pre-processing: centered (9), scaled (9)
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 54382, 54382, 54383, 54383, 54382
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 3 0.8592486 0.04221908
## 5 0.8749448 0.03787087
## 7 0.8801230 0.03086136
## 9 0.8826679 0.02728957
## 11 0.8840360 0.02590649
## 13 0.8843744 0.02119723
## 15 0.8846097 0.01961922
## 17 0.8850216 0.02141458
## 19 0.8851687 0.01925817
## 21 0.8853011 0.01858526
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 21.
knn_model$bestTune
## k
## 10 21
knn_pred <- predict(knn_model, newdata = knn_test)
knn_prob <- predict(knn_model, newdata = knn_test, type = "prob")[, "Readmitted_30"]
cm_knn <- confusionMatrix(
data = knn_pred,
reference = knn_test$Readmit_30,
positive = "Readmitted_30"
)
cm_knn
## Confusion Matrix and Statistics
##
## Reference
## Prediction Not_Readmitted Readmitted_30
## Not_Readmitted 25746 3293
## Readmitted_30 48 44
##
## Accuracy : 0.8853
## 95% CI : (0.8816, 0.8889)
## No Information Rate : 0.8854
## P-Value [Acc > NIR] : 0.5339
##
## Kappa : 0.0196
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.013185
## Specificity : 0.998139
## Pos Pred Value : 0.478261
## Neg Pred Value : 0.886601
## Prevalence : 0.114552
## Detection Rate : 0.001510
## Detection Prevalence : 0.003158
## Balanced Accuracy : 0.505662
##
## 'Positive' Class : Readmitted_30
##
knn_roc <- roc(
response = knn_test$Readmit_30,
predictor = knn_prob,
levels = c("Not_Readmitted", "Readmitted_30")
)
## Setting direction: controls < cases
knn_auc <- auc(knn_roc)
knn_auc
## Area under the curve: 0.5915
############################################################
# 12. ROC Curve Comparison
############################################################
png("06_roc_curve_comparison.png", width = 900, height = 700)
plot(
logit_roc,
col = "blue",
lwd = 3,
main = "ROC Curve Comparison Across Models",
legacy.axes = TRUE
)
plot(
tree_roc,
col = "red",
lwd = 3,
add = TRUE
)
plot(
rf_roc,
col = "darkgreen",
lwd = 3,
add = TRUE
)
plot(
knn_roc,
col = "purple",
lwd = 3,
add = TRUE
)
abline(a = 0, b = 1, lty = 2, col = "gray50")
legend(
"bottomright",
legend = c(
paste("Logistic Regression AUC =", round(logit_auc, 3)),
paste("Decision Tree AUC =", round(tree_auc, 3)),
paste("Random Forest AUC =", round(rf_auc, 3)),
paste("KNN AUC =", round(knn_auc, 3))
),
col = c("blue", "red", "darkgreen", "purple"),
lwd = 3,
cex = 0.9
)
dev.off()
## quartz_off_screen
## 2
############################################################
# 13. Model Comparison Tables
############################################################
baseline_pred <- factor(
rep("Not_Readmitted", nrow(test_data)),
levels = levels(test_data$Readmit_30)
)
cm_baseline <- confusionMatrix(
data = baseline_pred,
reference = test_data$Readmit_30,
positive = "Readmitted_30"
)
cm_logit_50 <- confusionMatrix(
test_data$logit_pred_50,
test_data$Readmit_30,
positive = "Readmitted_30"
)
cm_logit_15 <- confusionMatrix(
test_data$logit_pred_15,
test_data$Readmit_30,
positive = "Readmitted_30"
)
cm_tree_50 <- confusionMatrix(
test_data$tree_pred_50,
test_data$Readmit_30,
positive = "Readmitted_30"
)
cm_tree_15 <- confusionMatrix(
test_data$tree_pred_15,
test_data$Readmit_30,
positive = "Readmitted_30"
)
cm_rf_50 <- confusionMatrix(
test_data$rf_pred_50,
test_data$Readmit_30,
positive = "Readmitted_30"
)
cm_rf_15 <- confusionMatrix(
test_data$rf_pred_15,
test_data$Readmit_30,
positive = "Readmitted_30"
)
model_comparison <- tibble(
Model = c(
"Majority Class Baseline",
"Logistic Regression - 0.50 Threshold",
"Logistic Regression - 0.15 Threshold",
"Decision Tree - 0.50 Threshold",
"Decision Tree - 0.15 Threshold",
"Random Forest - 0.50 Threshold",
"Random Forest - 0.15 Threshold",
"KNN Model"
),
Accuracy = c(
cm_baseline$overall["Accuracy"],
cm_logit_50$overall["Accuracy"],
cm_logit_15$overall["Accuracy"],
cm_tree_50$overall["Accuracy"],
cm_tree_15$overall["Accuracy"],
cm_rf_50$overall["Accuracy"],
cm_rf_15$overall["Accuracy"],
cm_knn$overall["Accuracy"]
),
Sensitivity_Recall = c(
cm_baseline$byClass["Sensitivity"],
cm_logit_50$byClass["Sensitivity"],
cm_logit_15$byClass["Sensitivity"],
cm_tree_50$byClass["Sensitivity"],
cm_tree_15$byClass["Sensitivity"],
cm_rf_50$byClass["Sensitivity"],
cm_rf_15$byClass["Sensitivity"],
cm_knn$byClass["Sensitivity"]
),
Specificity = c(
cm_baseline$byClass["Specificity"],
cm_logit_50$byClass["Specificity"],
cm_logit_15$byClass["Specificity"],
cm_tree_50$byClass["Specificity"],
cm_tree_15$byClass["Specificity"],
cm_rf_50$byClass["Specificity"],
cm_rf_15$byClass["Specificity"],
cm_knn$byClass["Specificity"]
),
Precision = c(
NA,
cm_logit_50$byClass["Pos Pred Value"],
cm_logit_15$byClass["Pos Pred Value"],
cm_tree_50$byClass["Pos Pred Value"],
cm_tree_15$byClass["Pos Pred Value"],
cm_rf_50$byClass["Pos Pred Value"],
cm_rf_15$byClass["Pos Pred Value"],
cm_knn$byClass["Pos Pred Value"]
),
Balanced_Accuracy = c(
cm_baseline$byClass["Balanced Accuracy"],
cm_logit_50$byClass["Balanced Accuracy"],
cm_logit_15$byClass["Balanced Accuracy"],
cm_tree_50$byClass["Balanced Accuracy"],
cm_tree_15$byClass["Balanced Accuracy"],
cm_rf_50$byClass["Balanced Accuracy"],
cm_rf_15$byClass["Balanced Accuracy"],
cm_knn$byClass["Balanced Accuracy"]
),
AUC = c(
NA,
as.numeric(logit_auc),
as.numeric(logit_auc),
as.numeric(tree_auc),
as.numeric(tree_auc),
as.numeric(rf_auc),
as.numeric(rf_auc),
as.numeric(knn_auc)
)
) %>%
mutate(across(where(is.numeric), ~ round(.x, 4)))
model_comparison
## # A tibble: 8 × 7
## Model Accuracy Sensitivity_Recall Specificity Precision Balanced_Accuracy
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Majority … 0.885 0 1 NA 0.5
## 2 Logistic … 0.885 0.012 0.998 0.488 0.505
## 3 Logistic … 0.784 0.352 0.840 0.222 0.596
## 4 Decision … 0.706 0.493 0.734 0.193 0.613
## 5 Decision … 0.115 1 0 0.115 0.5
## 6 Random Fo… 0.886 0.015 0.998 0.562 0.507
## 7 Random Fo… 0.700 0.449 0.733 0.179 0.591
## 8 KNN Model 0.885 0.0132 0.998 0.478 0.506
## # ℹ 1 more variable: AUC <dbl>
write_csv(
model_comparison,
"model_comparison_metrics.csv"
)
ggplot(
model_comparison %>%
filter(Model != "Majority Class Baseline"),
aes(
x = reorder(Model, Sensitivity_Recall),
y = Sensitivity_Recall
)
) +
geom_col(fill = "darkred") +
coord_flip() +
labs(
title = "Model Comparison by Recall",
x = "Model",
y = "Recall / Sensitivity"
) +
theme_minimal()

ggsave(
"07_model_comparison_recall.png",
width = 8,
height = 6
)
############################################################
# 14. Predicted Risk Tiers
############################################################
test_data <- test_data %>%
mutate(
predicted_risk_tier = case_when(
rf_prob >= 0.20 ~ "Very High Risk",
rf_prob >= 0.15 ~ "High Risk",
rf_prob >= 0.10 ~ "Moderate Risk",
TRUE ~ "Low Risk"
),
predicted_risk_tier = factor(
predicted_risk_tier,
levels = c(
"Low Risk",
"Moderate Risk",
"High Risk",
"Very High Risk"
)
)
)
risk_tier_summary <- test_data %>%
group_by(predicted_risk_tier) %>%
summarise(
encounters = n(),
percent_of_test_set =
round(n() / nrow(test_data) * 100, 2),
actual_readmissions =
sum(Readmit_30 == "Readmitted_30"),
actual_readmission_rate =
round(
mean(Readmit_30 == "Readmitted_30") * 100,
2
),
avg_predicted_probability =
round(mean(rf_prob) * 100, 2),
avg_prior_inpatient =
round(mean(number_inpatient), 2),
avg_prior_emergency =
round(mean(number_emergency), 2),
avg_medications =
round(mean(num_medications), 2),
avg_diagnoses =
round(mean(number_diagnoses), 2),
avg_time_in_hospital =
round(mean(time_in_hospital), 2),
.groups = "drop"
) %>%
arrange(desc(actual_readmission_rate))
risk_tier_summary
## # A tibble: 4 × 11
## predicted_risk_tier encounters percent_of_test_set actual_readmissions
## <fct> <int> <dbl> <int>
## 1 Very High Risk 4435 15.2 941
## 2 High Risk 3947 13.6 556
## 3 Moderate Risk 6254 21.5 722
## 4 Low Risk 14495 49.8 1118
## # ℹ 7 more variables: actual_readmission_rate <dbl>,
## # avg_predicted_probability <dbl>, avg_prior_inpatient <dbl>,
## # avg_prior_emergency <dbl>, avg_medications <dbl>, avg_diagnoses <dbl>,
## # avg_time_in_hospital <dbl>
write_csv(
risk_tier_summary,
"predicted_risk_tier_summary.csv"
)
ggplot(
risk_tier_summary,
aes(
x = reorder(
predicted_risk_tier,
actual_readmission_rate
),
y = actual_readmission_rate
)
) +
geom_col(fill = "purple") +
geom_text(
aes(
label = paste0(actual_readmission_rate, "%")
),
vjust = -0.4
) +
labs(
title =
"Actual 30-Day Readmission Rate by Predicted Risk Tier",
x = "Predicted Risk Tier",
y = "Actual 30-Day Readmission Rate (%)"
) +
theme_minimal()

ggsave(
"08_actual_readmission_by_risk_tier.png",
width = 8,
height = 5
)
############################################################
# 15. Save Final Prediction File
############################################################
final_predictions <- test_data %>%
mutate(
knn_prob = knn_prob,
knn_pred = knn_pred
) %>%
select(
# Actual Outcome
Readmit_30,
# Logistic Regression
logit_prob,
logit_pred_50,
logit_pred_15,
# Decision Tree
tree_prob,
tree_pred_50,
tree_pred_15,
# Random Forest
rf_prob,
rf_pred_50,
rf_pred_15,
# KNN
knn_prob,
knn_pred,
# Risk Tier
predicted_risk_tier,
# Business Variables
age,
discharge_group,
time_in_hospital,
num_lab_procedures,
num_procedures,
num_medications,
number_outpatient,
number_emergency,
number_inpatient,
number_diagnoses,
prior_utilization,
insulin_change,
medication_change
)
head(final_predictions)
## # A tibble: 6 × 26
## Readmit_30 logit_prob logit_pred_50 logit_pred_15 tree_prob tree_pred_50
## <fct> <dbl> <fct> <fct> <dbl> <fct>
## 1 Not_Readmitted 0.0639 Not_Readmitted Not_Readmitted 0.354 Not_Readmit…
## 2 Not_Readmitted 0.116 Not_Readmitted Not_Readmitted 0.437 Not_Readmit…
## 3 Not_Readmitted 0.0874 Not_Readmitted Not_Readmitted 0.354 Not_Readmit…
## 4 Not_Readmitted 0.0777 Not_Readmitted Not_Readmitted 0.354 Not_Readmit…
## 5 Not_Readmitted 0.0777 Not_Readmitted Not_Readmitted 0.354 Not_Readmit…
## 6 Not_Readmitted 0.0847 Not_Readmitted Not_Readmitted 0.354 Not_Readmit…
## # ℹ 20 more variables: tree_pred_15 <fct>, rf_prob <dbl>, rf_pred_50 <fct>,
## # rf_pred_15 <fct>, knn_prob <dbl>, knn_pred <fct>,
## # predicted_risk_tier <fct>, age <fct>, discharge_group <fct>,
## # time_in_hospital <dbl>, num_lab_procedures <dbl>, num_procedures <dbl>,
## # num_medications <dbl>, number_outpatient <dbl>, number_emergency <dbl>,
## # number_inpatient <dbl>, number_diagnoses <dbl>, prior_utilization <dbl>,
## # insulin_change <fct>, medication_change <fct>
write_csv(
final_predictions,
"final_test_set_predictions.csv"
)
############################################################
# Additional Executive Summary Table
############################################################
prediction_summary <- final_predictions %>%
summarise(
total_patients = n(),
actual_readmissions =
sum(Readmit_30 == "Readmitted_30"),
actual_readmission_rate =
round(
mean(Readmit_30 == "Readmitted_30") * 100,
2
),
avg_rf_probability =
round(mean(rf_prob) * 100, 2),
avg_logit_probability =
round(mean(logit_prob) * 100, 2),
avg_tree_probability =
round(mean(tree_prob) * 100, 2),
avg_knn_probability =
round(mean(knn_prob) * 100, 2),
rf_high_risk_flagged =
sum(rf_pred_15 == "Readmitted_30"),
logit_high_risk_flagged =
sum(logit_pred_15 == "Readmitted_30"),
tree_high_risk_flagged =
sum(tree_pred_15 == "Readmitted_30"),
knn_high_risk_flagged =
sum(knn_pred == "Readmitted_30")
)
prediction_summary
## # A tibble: 1 × 11
## total_patients actual_readmissions actual_readmission_rate avg_rf_probability
## <int> <int> <dbl> <dbl>
## 1 29131 3337 11.5 11.8
## # ℹ 7 more variables: avg_logit_probability <dbl>, avg_tree_probability <dbl>,
## # avg_knn_probability <dbl>, rf_high_risk_flagged <int>,
## # logit_high_risk_flagged <int>, tree_high_risk_flagged <int>,
## # knn_high_risk_flagged <int>
write_csv(
prediction_summary,
"prediction_summary.csv"
)
############################################################
# 16. Final Business Summary
############################################################
best_auc_model <- model_comparison %>%
filter(!is.na(AUC)) %>%
arrange(desc(AUC)) %>%
slice(1)
best_recall_model <- model_comparison %>%
arrange(desc(Sensitivity_Recall)) %>%
slice(1)
best_balanced_model <- model_comparison %>%
arrange(desc(Balanced_Accuracy)) %>%
slice(1)
business_summary <- tibble(
Item = c(
"Baseline Accuracy",
"Best AUC Model",
"Best AUC",
"Best Recall Model",
"Best Recall",
"Best Balanced Accuracy Model",
"Best Balanced Accuracy",
"Highest Risk Tier Readmission Rate",
"Lowest Risk Tier Readmission Rate",
"Operational Recommendation",
"Primary Business Insight",
"KNN Performance Insight"
),
Value = c(
# Baseline
round(baseline_accuracy, 4),
# AUC
best_auc_model$Model,
round(best_auc_model$AUC, 4),
# Recall
best_recall_model$Model,
round(best_recall_model$Sensitivity_Recall, 4),
# Balanced Accuracy
best_balanced_model$Model,
round(best_balanced_model$Balanced_Accuracy, 4),
# Risk Tier Insights
paste0(
round(
max(risk_tier_summary$actual_readmission_rate),
2
),
"%"
),
paste0(
round(
min(risk_tier_summary$actual_readmission_rate),
2
),
"%"
),
# Recommendations
"Use lower probability thresholds such as 0.15 when hospitals prioritize identifying more high risk patients before discharge.",
"Risk stratification successfully separated low and high readmission populations, with Very High Risk patients showing substantially elevated readmission rates.",
"KNN produced weaker recall and balanced accuracy performance compared to Random Forest and Decision Tree models, limiting operational usefulness for intervention planning."
)
)
business_summary
## # A tibble: 12 × 2
## Item Value
## <chr> <chr>
## 1 Baseline Accuracy 0.8854
## 2 Best AUC Model Logistic Regression - 0.50 Threshold
## 3 Best AUC 0.6604
## 4 Best Recall Model Decision Tree - 0.15 Threshold
## 5 Best Recall 1
## 6 Best Balanced Accuracy Model Decision Tree - 0.50 Threshold
## 7 Best Balanced Accuracy 0.6132
## 8 Highest Risk Tier Readmission Rate 21.22%
## 9 Lowest Risk Tier Readmission Rate 7.71%
## 10 Operational Recommendation Use lower probability thresholds such as …
## 11 Primary Business Insight Risk stratification successfully separate…
## 12 KNN Performance Insight KNN produced weaker recall and balanced a…
write_csv(
business_summary,
"business_summary_for_presentation.csv"
)
############################################################
# Executive Visualization
############################################################
ggplot(
model_comparison %>%
filter(Model != "Majority Class Baseline"),
aes(
x = reorder(Model, Balanced_Accuracy),
y = Balanced_Accuracy,
fill = Model
)
) +
geom_col(show.legend = FALSE) +
coord_flip() +
geom_text(
aes(label = round(Balanced_Accuracy, 3)),
hjust = -0.1
) +
labs(
title = "Balanced Accuracy Across Predictive Models",
x = "Model",
y = "Balanced Accuracy"
) +
ylim(0, 1) +
theme_minimal()

ggsave(
"09_balanced_accuracy_comparison.png",
width = 9,
height = 6
)
############################################################
# High Risk Population Summary
############################################################
high_risk_summary <- risk_tier_summary %>%
filter(predicted_risk_tier %in% c("High Risk", "Very High Risk")) %>%
summarise(
total_high_risk_patients =
sum(encounters),
percent_of_population =
round(sum(percent_of_test_set), 2),
combined_actual_readmissions =
sum(actual_readmissions),
combined_readmission_rate =
round(
weighted.mean(
actual_readmission_rate,
encounters
),
2
)
)
high_risk_summary
## # A tibble: 1 × 4
## total_high_risk_patients percent_of_population combined_actual_readmissions
## <int> <dbl> <int>
## 1 8382 28.8 1497
## # ℹ 1 more variable: combined_readmission_rate <dbl>
write_csv(
high_risk_summary,
"high_risk_population_summary.csv"
)