Synopsis

The dataset description

  • The UCI Machine Learning Repository has a data set of 70,000 encounters with patients having a diabetes diagnosis with 50+ features. [https://archive.ics.uci.edu/ml/datasets/Diabetes+130-US+hospitals+for+years+1999-2008#]
  • This data set was used for a statistical study “Impact of HbA1c Measurement on Hospital Readmission Rates: Analysis of 70,000 Clinical Database Patient Records”
  • The Health Facts data used, was an extract representing 10 years (1999–2008) of clinical care at 130 hospitals and integrated delivery networks throughout the United States.
  • The data contains both inpatient and outpatient data, including emergency department, for the same group of patients.
  • List of features and their descriptions in the initial data set:

URL: [https://www.hindawi.com/journals/bmri/2014/781670/tab1/]

1. Data Preparation

Getting Data

## Warning: package 'tidyverse' was built under R version 4.0.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v dplyr   1.0.2
## v tibble  3.0.4     v stringr 1.4.0
## v tidyr   1.1.2     v forcats 0.5.1
## v purrr   0.3.4
## Warning: package 'ggplot2' was built under R version 4.0.3
## Warning: package 'tibble' was built under R version 4.0.3
## Warning: package 'stringr' was built under R version 4.0.3
## Warning: package 'forcats' was built under R version 4.0.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
diaData = read_csv("C:/Users/Anhuynh/Desktop/Data Science Project/Hospital Readmission/dataset_diabetes/diabetic_data.csv")
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   encounter_id = col_double(),
##   patient_nbr = col_double(),
##   admission_type_id = col_double(),
##   discharge_disposition_id = col_double(),
##   admission_source_id = col_double(),
##   time_in_hospital = col_double(),
##   num_lab_procedures = col_double(),
##   num_procedures = col_double(),
##   num_medications = col_double(),
##   number_outpatient = col_double(),
##   number_emergency = col_double(),
##   number_inpatient = col_double(),
##   number_diagnoses = col_double()
## )
## See spec(...) for full column specifications.

Finding missing values

# Calculate percentage of missing values in data set for choosing target variances
num_na <- function(x, ...) { 
        y = table(x)
        na <- y[names(y) == "?"]
}

encounter_id <- paste(round((num_na(diaData$encounter_id)/ 101766) * 100, 2), "%")
patient_nbr <- paste(round((num_na(diaData$patient_nbr)/ 101766) * 100, 2), "%")
race <- paste(round((num_na(diaData$race)/ 101766) * 100, 2), "%")
age <- paste(round((num_na(diaData$age)/ 101766) * 100, 2), "%")
gender <- paste(round((num_na(diaData$gender)/ 101766) * 100, 2), "%")
weight <- paste(round((num_na(diaData$weight)/ 101766) * 100, 2), "%")
admission_type_id <- paste(round((num_na(diaData$admission_type_id)/ 101766) * 100, 2), "%")
discharge_disposition_id <- paste(round((num_na(diaData$discharge_disposition_id)/ 101766) * 100, 2), "%")
admission_source_id <- paste(round((num_na(diaData$admission_source_id)/ 101766) * 100, 2), "%")
payer_code <- paste(round((num_na(diaData$payer_code)/ 101766) * 100, 2), "%")
num_lab_procedures <- paste(round((num_na(diaData$num_lab_procedures)/ 101766) * 100, 2), "%")
num_procedures <- paste(round((num_na(diaData$num_procedures)/ 101766) * 100, 2), "%")
time_in_hospital <- paste(round((num_na(diaData$time_in_hospital)/ 101766) * 100, 2), "%")
medical_spec <- paste(round((num_na(diaData$medical_specialty)/ 101766) * 100, 2), "%")
num_medications <- paste(round((num_na(diaData$num_medications)/ 101766) * 100, 2), "%")
num_outpatient <- paste(round((num_na(diaData$number_outpatient)/ 101766) * 100, 2), "%")
num_inpatient <- paste(round((num_na(diaData$number_inpatient)/ 101766) * 100, 2), "%")
num_emergency <- paste(round((num_na(diaData$number_emergency)/ 101766) * 100, 2), "%")
num_diagnoses <- paste(round((num_na(diaData$number_diagnoses)/ 101766) * 100, 2), "%")
diag_1 <- paste(round((num_na(diaData$diag_1)/ 101766) * 100, 2), "%")
diag_2 <- paste(round((num_na(diaData$diag_2)/ 101766) * 100, 2), "%")
diag_3 <- paste(round((num_na(diaData$diag_3)/ 101766) * 100, 2), "%")
glu_serum <- paste(round((num_na(diaData$max_glu_serum)/ 101766) * 100, 2), "%")
A1C <- paste(round((num_na(diaData$A1Cresult)/ 101766) * 100, 2), "%")
insulin <- paste(round((num_na(diaData$insulin)/ 101766) * 100, 2), "%")
metformin <- paste(round((num_na(diaData$metformin)/ 101766) * 100, 2), "%")
repaglinide <- paste(round((num_na(diaData$repaglinide)/ 101766) * 100, 2), "%")
nateglinide <- paste(round((num_na(diaData$nateglinide)/ 101766) * 100, 2), "%")
chlorpropamide <- paste(round((num_na(diaData$chlorpropamide)/ 101766) * 100, 2), "%")
glimepiride <- paste(round((num_na(diaData$glimepiride)/ 101766) * 100, 2), "%")
acetohexamide <- paste(round((num_na(diaData$acetohexamide)/ 101766) * 100, 2), "%")
glipizide <- paste(round((num_na(diaData$glipizide)/ 101766) * 100, 2), "%")
glyburide <- paste(round((num_na(diaData$glyburide)/ 101766) * 100, 2), "%")
tolbutamide <- paste(round((num_na(diaData$tolbutamide)/ 101766) * 100, 2), "%")
examide <- paste(round((num_na(diaData$examide)/ 101766) * 100, 2), "%")
citoglipton <- paste(round((num_na(diaData$citoglipton)/ 101766) * 100, 2), "%")
changeMed <- paste(round((num_na(diaData$change)/ 101766) * 100, 2), "%")
diabetesMed <- paste(round((num_na(diaData$diabetesMed)/ 101766) * 100, 2), "%")
readmitted <- paste(round((num_na(diaData$readmitted)/ 101766) * 100, 2), "%")

NA_data <- rbind(encounter_id, patient_nbr, race, age, gender, 
                 weight,admission_type_id,discharge_disposition_id, 
                 admission_source_id,payer_code, num_lab_procedures,num_procedures, 
                 time_in_hospital, medical_spec, num_medications, num_outpatient,
                 num_inpatient, num_emergency,
                 num_diagnoses,diag_1, diag_2,diag_3, glu_serum, A1C, insulin, metformin,
                 repaglinide ,
                 nateglinide, chlorpropamide, glimepiride, acetohexamide, glipizide, glyburide,
                 tolbutamide,
                 examide, citoglipton, changeMed, diabetesMed, readmitted )

colnames(NA_data) <- c("Missing_Value_Percentage")

NA_data
##                          Missing_Value_Percentage
## encounter_id             " %"                    
## patient_nbr              " %"                    
## race                     "2.23 %"                
## age                      " %"                    
## gender                   " %"                    
## weight                   "96.86 %"               
## admission_type_id        " %"                    
## discharge_disposition_id " %"                    
## admission_source_id      " %"                    
## payer_code               "39.56 %"               
## num_lab_procedures       " %"                    
## num_procedures           " %"                    
## time_in_hospital         " %"                    
## medical_spec             "49.08 %"               
## num_medications          " %"                    
## num_outpatient           " %"                    
## num_inpatient            " %"                    
## num_emergency            " %"                    
## num_diagnoses            " %"                    
## diag_1                   "0.02 %"                
## diag_2                   "0.35 %"                
## diag_3                   "1.4 %"                 
## glu_serum                " %"                    
## A1C                      " %"                    
## insulin                  " %"                    
## metformin                " %"                    
## repaglinide              " %"                    
## nateglinide              " %"                    
## chlorpropamide           " %"                    
## glimepiride              " %"                    
## acetohexamide            " %"                    
## glipizide                " %"                    
## glyburide                " %"                    
## tolbutamide              " %"                    
## examide                  " %"                    
## citoglipton              " %"                    
## changeMed                " %"                    
## diabetesMed              " %"                    
## readmitted               " %"

Remove unused variable, patients who are dead/expired/in hospice to avoid bias.

This is information extracted from IDs_mapping.csv of initial data link provided earlier.

# Identify records of patients who are dead/expired/in hospice
id <- table(diaData$discharge_disposition_id)
id_del <- id[-c(11, 13 ,14, 19, 20)]

diaData <- subset(diaData, diaData$discharge_disposition_id %in% names(id_del))

# Remove unused variance (Weight) from initial data set

diaData <- diaData[-c(6)]

# Replace "?" from data set with "No Record" notation

sub_dt1 <- replace(diaData$race, diaData$race %in% c("?"), c("No record") )
sub_dt2 <- replace(diaData$payer_code, diaData$payer_code %in% c("?"), c("No record"))
sub_dt3 <- replace(diaData$medical_specialty, diaData$medical_specialty %in% c("?"), c("No record"))
sub_dt4 <- replace(diaData$diag_1, diaData$diag_1 %in% c("?"), c("No record"))
sub_dt5 <- replace(diaData$diag_2, diaData$diag_2 %in% c("?"), c("No record"))
sub_dt6 <- replace(diaData$diag_3, diaData$diag_3 %in% c("?"), c("No record"))

diaData <- mutate(diaData, race = sub_dt1, 
                  payer_code = sub_dt2, 
                  medical_specialty = sub_dt3,
                  diag_1 = sub_dt4,
                  diag_2 = sub_dt5,
                  diag_3 = sub_dt6 )

2. Descriptive Analysis with uni-variate, bi-variate, and multivariate evaluation.

Uni-variate evaluation

# Race
count_race <- table(diaData$race)
df_race <- data.frame(race = c("AfricanAmerican", "Asian", "Caucasian", "Hispanic", "No record", "Other"), 
                      count = c(18772, 628, 74220, 2017, 2234, 1472))
ggplot(data=df_race, aes(x = race, y = count, fill = race)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=df_race$count, label= df_race$count), vjust=1, color="black", size=3) +
labs(title="Count on Race")
## Warning: Use of `df_race$count` is discouraged. Use `count` instead.

## Warning: Use of `df_race$count` is discouraged. Use `count` instead.

# Gender
count_gender <- table(diaData$gender)
df_gender <- data.frame(gender = c("Female", "Male"), count = c(53454, 45886))
ggplot(data=df_gender, aes(x = gender, y = count, fill = gender)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=df_gender$count, label= df_gender$count), vjust=1, color="black", size=3) +
labs(title="Count on Gender")
## Warning: Use of `df_gender$count` is discouraged. Use `count` instead.
## Warning: Use of `df_gender$count` is discouraged. Use `count` instead.

# Age
count_age <- table(diaData$age)
df_age <- data.frame(age = c("[0-10)", "[10-20)", "[20-30)", "[30-40)", "[40-50)", "[50-60)", "[60-70)", "[70-80)", "[80-90)", "[90-100)"), count = c(160, 690, 1649, 3764, 9607, 17060, 22059, 25331, 16434, 2589))
ggplot(data=df_age, aes(x = age, y = count, fill = age)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=df_age$count, label= df_age$count), vjust=1, color="black", size=3) +
labs(title="Count on Age")
## Warning: Use of `df_age$count` is discouraged. Use `count` instead.
## Warning: Use of `df_age$count` is discouraged. Use `count` instead.

# Admission Type
count_ad <- table(diaData$admission_type_id)
df_ad <- data.frame(admission = c("Emergency", "Urgent", "Elective", "Newborn", "Not Available", "NULL", "Trauma Center", "Not Mapped"), count = c(52371, 18132, 18668, 10, 4617, 5207, 18, 320))
ggplot(data=df_ad, aes(x = admission, y = count, fill = admission)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=df_ad$count, label= df_ad$count), vjust=1, color="black", size=3) +
labs(title="Count on Admission Type")
## Warning: Use of `df_ad$count` is discouraged. Use `count` instead.
## Warning: Use of `df_ad$count` is discouraged. Use `count` instead.

# Readmitted (day)

count_re <- table(diaData$readmitted)
df_re <- data.frame(readmission = c("<30", ">30", "No Record"), count = c(11357, 35545, 54864))
ggplot(data=df_re, aes(x = readmission, y = count, fill = readmission)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=df_re$count, label= df_re$count), vjust=1, color="black", size=3) +
labs(title="Count on Readmitted day")
## Warning: Use of `df_re$count` is discouraged. Use `count` instead.
## Warning: Use of `df_re$count` is discouraged. Use `count` instead.

How the relationship two other variables varies across each independent variables

# Distribution of the variables counts, or how many patients fall into each bucket of measurements.

hist(diaData$time_in_hospital, col = "green")
rug(diaData$time_in_hospital)
abline(v = median(diaData$time_in_hospital), col = "magenta", lwd = 4)

hist(diaData$number_diagnoses, col = "blue")
rug(diaData$number_diagnoses)
abline(v = median(diaData$number_diagnoses), col = "magenta", lwd = 4)

hist(diaData$num_lab_procedures, col = "gold")
rug(diaData$num_lab_procedures)
abline(v = median(diaData$num_lab_procedures), col = "magenta", lwd = 4)

hist(diaData$num_medications, col = "maroon")
rug(diaData$num_medications)
abline(v = median(diaData$num_medications), col = "magenta", lwd = 4)

Testing multivariables

#Correlation number_emergency ~ number_diagnoses | A1Cresult / Insulin / Glucose 
g <- ggplot(diaData, aes(number_diagnoses, number_emergency))
g + geom_smooth(method = "lm", col = "magenta") + facet_grid( .~A1Cresult) + geom_point(color = "blue",size=1.75) + labs(title = "Number of emergency correlates to A1Cresult test result") 
## `geom_smooth()` using formula 'y ~ x'

g + geom_smooth(method = "lm", col ="magenta") + facet_grid( .~insulin) + geom_point(color = "darkgreen",size=1.75) + labs(title = "Number of emergency correlates to Insulin test result") 
## `geom_smooth()` using formula 'y ~ x'

g + geom_smooth(method = "lm", col = "magenta") + facet_grid( .~max_glu_serum) + geom_point(color = "gold",size=1.75) + labs(title = "Number of emergency correlates to Glucose serum test result") 
## `geom_smooth()` using formula 'y ~ x'

# Comparison of number_emergency ~ number_diagnoses |A1Cresult vs. Glucose

par(mfrow = c(1,2), mar = c(4,4,2,1))

qplot(number_diagnoses, number_emergency, data = diaData, color = A1Cresult, facets = .~A1Cresult) + geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

qplot(number_diagnoses, number_emergency, data = diaData, color = max_glu_serum, facets = .~max_glu_serum) + geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

# Comparison of number_emergency ~ A1Cresult vs. Glucose 
qplot(number_inpatient, number_outpatient, data = diaData, color = A1Cresult, facets = .~A1Cresult) + geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

qplot(number_inpatient, number_outpatient, data = diaData, color = max_glu_serum, facets = .~max_glu_serum) + geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

# Comparison of time_in_hospital ~ number of patient | A1Cresult 

qplot(number_inpatient, time_in_hospital, data = diaData, color = A1Cresult, facets = .~A1Cresult) + geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

qplot(number_outpatient, time_in_hospital, data = diaData, color = A1Cresult, facets = .~A1Cresult) + geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

# Comparison of time_in_hospital ~ number of patients | Glucose test result

qplot(number_inpatient, time_in_hospital, data = diaData, color = max_glu_serum, facets = .~max_glu_serum) + geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

qplot(number_outpatient, time_in_hospital, data = diaData, color = max_glu_serum, facets = .~max_glu_serum) + geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

# Comparison of time_in_hospital ~ number of patients | readmitted days

qplot(number_inpatient, time_in_hospital, data = diaData, color = readmitted, facets = .~readmitted) + geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

qplot(number_outpatient, time_in_hospital, data = diaData, color = readmitted, facets = .~readmitted) + geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

Checking correlation across the diagnoses

# Correlation of number_emergency ~ medical_specialty | discharge_disposition_id | admission_source_id

# Create subset for sorting data

diaData_ord = diaData[order(diaData$number_emergency, decreasing = TRUE), ]
sort_med <- diaData_ord[c(1:length(diaData)), ]

# Plot data with graph
par(mfrow = c(2,2), mar = c(5,4,2,1))

ggplot(data=sort_med, aes(x = medical_specialty, y=number_emergency, fill=medical_specialty)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=sort_med$number_emergency, label=""), vjust=1, color="black", size=3) +
labs(title="Topmost Medical Specialty impact number of emergency") +
theme(legend.title = element_blank(), axis.text.x=element_text(angle=45,hjust=1,vjust=1))
## Warning: Use of `sort_med$number_emergency` is discouraged. Use
## `number_emergency` instead.

ggplot(data=sort_med, aes(x = discharge_disposition_id, y=number_emergency, fill=discharge_disposition_id)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=sort_med$number_emergency, label=""), vjust=1, color="black", size=3) +
labs(title="Topmost Discharge Type impact Number of Emergency") +
theme(legend.title = element_blank(), axis.text.x=element_text(angle=45,hjust=1,vjust=1))
## Warning: Use of `sort_med$number_emergency` is discouraged. Use
## `number_emergency` instead.

ggplot(data=sort_med, aes(x = admission_source_id, y=number_emergency, fill=admission_source_id)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=sort_med$number_emergency, label=""), vjust=1, color="black", size=3) +
labs(title="Topmost Admission Source impact Number of Emergency") +
theme(legend.title = element_blank(), axis.text.x=element_text(angle=45,hjust=1,vjust=1))
## Warning: Use of `sort_med$number_emergency` is discouraged. Use
## `number_emergency` instead.

** Results: **

  • Emergency/Trauma caused the greatest number of emergency visit
  • The highest number of patients discharged to home (Discharge_dispotion_ID: 01) after treatment
  • Emergency Room was the most dominated patient source as compared to the other ones
# Correlation of number_emergency ~ readmitted
par(mfrow = c(2,2), mar = c(5,4,2,1))

ggplot(data=diaData, aes(x = readmitted, y=number_emergency, fill=readmitted)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=diaData$number_emergency, label=""), vjust=1, color="black", size=3) +
labs(title="Number of emergency vary to readmitted day") +
theme(legend.title = element_blank(), axis.text.x=element_text(angle=45,hjust=1,vjust=1))
## Warning: Use of `diaData$number_emergency` is discouraged. Use
## `number_emergency` instead.

# Correlation of number_emergency ~ change
par(mfrow = c(2,2), mar = c(5,4,2,1))

ggplot(data=diaData, aes(x=change, y=number_emergency, fill=change)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=diaData$number_emergency, label=""), vjust=1, color="black", size=3) +
labs(title="Change medication impacts number of emergency") +
theme(legend.title = element_blank(), axis.text.x=element_text(angle=45,hjust=1,vjust=1) )
## Warning: Use of `diaData$number_emergency` is discouraged. Use
## `number_emergency` instead.

Correlation of number of patients ~ medical_specialty | change | readmitted

library(tidyr)
library(dplyr)
library(ggplot2)
sort_med %>% select(medical_specialty, number_inpatient, number_outpatient) %>%
  pivot_longer(., cols = c(number_inpatient, number_outpatient), names_to = "Type_of_patient", values_to = "Number_of_patient") %>%
  ggplot(aes(x = Type_of_patient, y = Number_of_patient, fill = medical_specialty)) +
  geom_boxplot() +
  labs(title="Topmost Medical Specialty vary to Type of Patients")

diaData %>% select(change, number_inpatient, number_outpatient) %>%
  pivot_longer(., cols = c(number_inpatient, number_outpatient), names_to = "Change_medication", values_to = "Number_of_patient") %>%
  ggplot(aes(x = Change_medication, y = Number_of_patient, fill = change)) +
  geom_boxplot() +
  labs(title="Change Medication vary to Type of Patients")

diaData %>% select(readmitted, number_inpatient, number_outpatient) %>%
  pivot_longer(., cols = c(number_inpatient, number_outpatient), names_to = "Readmitted", values_to = "Number_of_patient") %>%
  ggplot(aes(x = Readmitted, y = Number_of_patient, fill = readmitted)) +
  geom_boxplot() +
  labs(title="Readmitted days vary to Type of Patients")

3. Exploratory Data Analysis

## Warning: package 'corrplot' was built under R version 4.0.3
## corrplot 0.84 loaded
## Warning: package 'Hmisc' was built under R version 4.0.3
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Warning: package 'Formula' was built under R version 4.0.3
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## Warning: package 'ggcorrplot' was built under R version 4.0.3
diadt <- cor(diaData[ ,c("time_in_hospital", "num_lab_procedures", "num_procedures",
                         "num_medications", "number_outpatient", "number_emergency",
                         "number_inpatient", "number_diagnoses")])
cormat <- diadt
ggcorrplot::ggcorrplot(cormat, title = "Correlation of Numeric Variables")

# devtools::install_github("laresbernardo/lares")
library(lares)
## 
## Attaching package: 'lares'
## The following object is masked from 'package:Hmisc':
## 
##     impute
dia <- subset(diaData, select = c("time_in_hospital", "num_lab_procedures",
                                      "num_procedures","num_medications",
                                      "number_outpatient",
                                      "number_emergency", "number_inpatient",
                                      "number_diagnoses"))
corr_cross(dia, # name of dataset
  max_pvalue = 0.05, # display only significant correlations (at 5% level)
  top = 10 # display top 10 couples of variables (by correlation coefficient)
)
## Returning only the top 10. You may override with the 'top' argument
## Warning in theme_lares(legend = "top"): Font 'Arial Narrow' is not installed,
## has other name, or can't be found

4. Predictive Analysis

4.1. Feature Engineering

4.1.1. Create a single “visit” feature such as binary number (1: hospital visit; 0: no hospital visit ), combined number_inpatient, number_outpatient, and number_emergency.

# Create "visit" feature
diaData_vis <- subset(diaData, number_outpatient > 0 & number_inpatient > 0 & number_emergency > 0)
diaData_vis$CAT <- 1
diaData_vis$CATNAME <- "hospital_visit"
diaData_vis$total_patient <- diaData_vis$number_outpatient + diaData_vis$number_inpatient + diaData_vis$number_emergency

diaData_novis <- subset(diaData, number_outpatient == 0 & number_inpatient == 0 & number_emergency == 0)
diaData_novis$CAT <- 0
diaData_novis$CATNAME <- "no_hospital_visit"
diaData_novis$total_patient <- diaData_novis$number_outpatient + 
                               diaData_novis$number_inpatient + 
                               diaData_novis$number_emergency

# Combine "hospital visit" and "no hospital visit" feature into one dataset.
final_diaData <- rbind(diaData_vis, diaData_novis)

4.1.2. Create two features showing impact level (up/down/steady) of 23 medication features in dataset.

  • The 23-medication-feature indicates whether the drug was prescribed or there was a change in the dosage. Values: “up” if the dosage was increased during the encounter, “down” if the dosage was decreased, “steady” if the dosage did not change, and “no” if the drug was not prescribed.

  • The 1st feature is “Steady”, which indicates us how many number of medications the patient is taking steadily in the dosage change.

  • The 2nd feature is “Up_down”, which indicates us how many number of medications have been increased or decreased in dosage for the patient.

library(dplyr)

f1 <- final_diaData %>%
        select(1:length(final_diaData)) %>%
        mutate(Up_down = ifelse(metformin == c("Up", "Down"), 1, 0) ) %>%
        mutate (Steady = ifelse(metformin =="Steady", 1, 0) )
f2 <- final_diaData %>%
        select(1:length(final_diaData)) %>%
        mutate(Up_down = ifelse(repaglinide == c("Up", "Down"), 1, 0) ) %>%
        mutate (Steady = ifelse(repaglinide =="Steady", 1, 0) )
f3 <- final_diaData %>%
        select(1:length(final_diaData)) %>%
        mutate(Up_down = ifelse(chlorpropamide == c("Up", "Down"), 1, 0) ) %>%
        mutate (Steady = ifelse(chlorpropamide =="Steady", 1, 0) )
f4 <- final_diaData %>%
        select(1:length(final_diaData)) %>%
        mutate(Up_down = ifelse(glimepiride == c("Up", "Down"), 1, 0) ) %>%
        mutate (Steady = ifelse(glimepiride =="Steady", 1, 0) )
f5 <- final_diaData %>%
        select(1:length(final_diaData)) %>%
        mutate(Up_down = ifelse(acetohexamide == c("Up", "Down"), 1, 0) ) %>%
        mutate (Steady = ifelse(acetohexamide =="Steady", 1, 0) )
f6 <- final_diaData %>%
        select(1:length(final_diaData)) %>%
        mutate(Up_down = ifelse(glipizide == c("Up", "Down"), 1, 0) ) %>%
        mutate (Steady = ifelse(glipizide =="Steady", 1, 0) )
f7 <- final_diaData %>%
        select(1:length(final_diaData)) %>%
        mutate(Up_down = ifelse(glyburide == c("Up", "Down"), 1, 0) ) %>%
        mutate (Steady = ifelse(glyburide =="Steady", 1, 0) )
f8 <- final_diaData %>%
        select(1:length(final_diaData)) %>%
        mutate(Up_down = ifelse(tolbutamide == c("Up", "Down"), 1, 0) ) %>%
        mutate (Steady = ifelse(tolbutamide =="Steady", 1, 0) )
f9 <- final_diaData %>%
        select(1:length(final_diaData)) %>%
        mutate(Up_down = ifelse(pioglitazone == c("Up", "Down"), 1, 0) ) %>%
        mutate (Steady = ifelse(pioglitazone =="Steady", 1, 0) )
f10 <- final_diaData %>%
        select(1:length(final_diaData)) %>%
        mutate(Up_down = ifelse(rosiglitazone == c("Up", "Down"), 1, 0) ) %>%
        mutate (Steady = ifelse(rosiglitazone =="Steady", 1, 0) )
f11 <- final_diaData %>%
        select(1:length(final_diaData)) %>%
        mutate(Up_down = ifelse(acarbose == c("Up", "Down"), 1, 0) ) %>%
        mutate (Steady = ifelse(acarbose =="Steady", 1, 0) )
f12 <- final_diaData %>%
        select(1:length(final_diaData)) %>%
        mutate(Up_down = ifelse(miglitol == c("Up", "Down"), 1, 0) ) %>%
        mutate (Steady = ifelse(miglitol =="Steady", 1, 0) )
f13 <- final_diaData %>%
        select(1:length(final_diaData)) %>%
        mutate(Up_down = ifelse(troglitazone == c("Up", "Down"), 1, 0) ) %>%
        mutate (Steady = ifelse(troglitazone =="Steady", 1, 0) )
f14 <- final_diaData %>%
        select(1:length(final_diaData)) %>%
        mutate(Up_down = ifelse(tolazamide == c("Up", "Down"), 1, 0) ) %>%
        mutate (Steady = ifelse(tolazamide =="Steady", 1, 0) )
f15 <- final_diaData %>%
        select(1:length(final_diaData)) %>%
        mutate(Up_down = ifelse(examide == c("Up", "Down"), 1, 0) ) %>%
        mutate (Steady = ifelse(examide =="Steady", 1, 0) )
f16 <- final_diaData %>%
        select(1:length(final_diaData)) %>%
        mutate(Up_down = ifelse(citoglipton == c("Up", "Down"), 1, 0) ) %>%
        mutate (Steady = ifelse(citoglipton =="Steady", 1, 0) )
f17 <- final_diaData %>%
        select(1:length(final_diaData)) %>%
        mutate(Up_down = ifelse(insulin == c("Up", "Down"), 1, 0) ) %>%
        mutate (Steady = ifelse(insulin =="Steady", 1, 0) )
f18 <- final_diaData %>%
        select(1:length(final_diaData)) %>%
        mutate(Up_down = ifelse('glyburide-metformin' %in% c("Up", "Down"), 1, 0) ) %>%
        mutate (Steady = ifelse('glyburide-metformin' == "Steady", 1, 0) )
f19 <- final_diaData %>%
        select(1:length(final_diaData)) %>%
        mutate(Up_down = ifelse('glipizide-metformin' %in% c("Up", "Down"), 1, 0) ) %>%
        mutate (Steady = ifelse('glipizide-metformin' == "Steady", 1, 0) )
f20 <- final_diaData %>%
        select(1:length(final_diaData)) %>%
        mutate(Up_down = ifelse('glimepiride-pioglitazone' %in% c("Up", "Down"), 1, 0) ) %>%
        mutate (Steady = ifelse('glimepiride-pioglitazone' == "Steady", 1, 0) )
f21 <- final_diaData %>%
        select(1:length(final_diaData)) %>%
        mutate(Up_down = ifelse('metformin-rosiglitazone' %in% c("Up", "Down"), 1, 0) ) %>%
        mutate (Steady = ifelse('metformin-rosiglitazone' =="Steady", 1, 0) )
f22 <- final_diaData %>%
        select(1:length(final_diaData)) %>%
        mutate(Up_down = ifelse('metformin-pioglitazone' %in% c("Up", "Down"), 1, 0) ) %>%
        mutate (Steady = ifelse('metformin-pioglitazone' =="Steady", 1, 0) )
f23 <- final_diaData %>%
        select(1:length(final_diaData)) %>%
        mutate(Up_down = ifelse(nateglinide == c("Up", "Down"), 1, 0) ) %>%
        mutate (Steady = ifelse(nateglinide =="Steady", 1, 0) )

# Combine "Steady" and "Up_Down" feature into one dataset.
fin_diaData <- rbind(f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15, f16, f17, f18, f19, f20, f21, f22, f23)

4.2. Feature Selection for Training Set & Test Set

Among 50+ variables, only 19 variables were introduced after feature engineering to predict the number of patients who got readmitted under 30 days.

library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.0.3
## -- Attaching packages -------------------------------------- tidymodels 0.1.2 --
## v broom     0.7.3      v recipes   0.1.15
## v dials     0.0.9      v rsample   0.0.8 
## v infer     0.5.3      v tune      0.1.2 
## v modeldata 0.1.0      v workflows 0.2.1 
## v parsnip   0.1.5      v yardstick 0.0.7
## Warning: package 'broom' was built under R version 4.0.3
## Warning: package 'dials' was built under R version 4.0.3
## Warning: package 'infer' was built under R version 4.0.3
## Warning: package 'modeldata' was built under R version 4.0.3
## Warning: package 'parsnip' was built under R version 4.0.3
## Warning: package 'recipes' was built under R version 4.0.3
## Warning: package 'rsample' was built under R version 4.0.3
## Warning: package 'tune' was built under R version 4.0.3
## Warning: package 'workflows' was built under R version 4.0.3
## Warning: package 'yardstick' was built under R version 4.0.3
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x yardstick::conf_mat() masks lares::conf_mat()
## x scales::discard()     masks purrr::discard()
## x dplyr::filter()       masks stats::filter()
## x recipes::fixed()      masks stringr::fixed()
## x dplyr::lag()          masks stats::lag()
## x yardstick::mae()      masks lares::mae()
## x yardstick::mape()     masks lares::mape()
## x yardstick::rmse()     masks lares::rmse()
## x yardstick::rsq()      masks lares::rsq()
## x yardstick::spec()     masks readr::spec()
## x Hmisc::src()          masks dplyr::src()
## x recipes::step()       masks stats::step()
## x Hmisc::summarize()    masks dplyr::summarize()
## x parsnip::translate()  masks Hmisc::translate()
set.seed(1234)

sub_diaData <- subset(fin_diaData, select = c("race", "gender", "age", 
                                                "time_in_hospital", "num_lab_procedures", 
                                                "num_procedures", "num_medications", 
                                                "number_diagnoses", "max_glu_serum",
                                                "A1Cresult", "number_emergency",
                                                "change", "diabetesMed" , "readmitted",
                                                "CAT", 
                                                "total_patient", "Up_down",
                                                "Steady"))
library(vtreat)
## Warning: package 'vtreat' was built under R version 4.0.3
## Loading required package: wrapr
## Warning: package 'wrapr' was built under R version 4.0.3
## 
## Attaching package: 'wrapr'
## The following object is masked from 'package:dplyr':
## 
##     coalesce
## The following objects are masked from 'package:tidyr':
## 
##     pack, unpack
## The following object is masked from 'package:tibble':
## 
##     view
## 
## Attaching package: 'vtreat'
## The following object is masked from 'package:recipes':
## 
##     prepare
## The following object is masked from 'package:parsnip':
## 
##     fit
# Encode data
vtr <- vtreat::designTreatmentsZ(sub_diaData, c("race", "gender", "age", "max_glu_serum",
                                              "A1Cresult", "change", "diabetesMed",
                                              "time_in_hospital", "num_lab_procedures",
                                              "num_procedures", "num_medications",
                                              "number_diagnoses", "number_emergency",
                                              "CAT", "total_patient", "Up_down",
                                              "Steady"))
## [1] "vtreat 1.6.2 inspecting inputs Wed Feb 17 22:19:57 2021"
## [1] "designing treatments Wed Feb 17 22:19:57 2021"
## [1] " have initial level statistics Wed Feb 17 22:20:04 2021"
## [1] " scoring treatments Wed Feb 17 22:20:20 2021"
## [1] "have treatment plan Wed Feb 17 22:20:28 2021"
new_dia <- vtreat::prepare(vtr, sub_diaData, extracols = "readmitted")

detach("package:vtreat", unload = TRUE)
new_dia$readmitted <- as.factor(as.character(new_dia$readmitted))
fin_dia <- subset(new_dia, select = c("race_catP", "gender_catP", "age_catP",
                                          "max_glu_serum_catP", "A1Cresult_catP",
                                          "time_in_hospital", "num_lab_procedures",
                                          "num_procedures", "num_medications",
                                          "number_diagnoses", "number_emergency",
                                          "CAT", "total_patient", "Up_down", "Steady",
                                          "change_lev_x_Ch", "change_lev_x_No",
                                          "diabetesMed_lev_x_No", "diabetesMed_lev_x_Yes",
                                          "readmitted"
                                          ))

slice_dia <- fin_dia %>%
  slice(1:300000)

dia_split <- initial_split(slice_dia, strata = readmitted, prop = 3/4)
dia_train <- training(dia_split) # training set
dia_test <- testing(dia_split) # validation set

4.3. Data Modeling

    1. Analysis of Co-variance (AOV)
    1. R-Chi Square Test
    1. Random Forest (Classification)
    1. Decision Tree

4.3.1. Analysis of Co-variance (AOV)

This is to closely analyze the interaction of A1Cresult (HbA1c) and max_glu_serum (Glucose serum) to number of hospital visits and time in hospital of the patient.

# Create the regression model 01.
res1 <- aov(total_patient ~ time_in_hospital * A1Cresult, data = sub_diaData)
print(summary(res1))
##                                 Df  Sum Sq Mean Sq F value Pr(>F)    
## time_in_hospital                 1     949   949.1  279.31 <2e-16 ***
## A1Cresult                        3    5989  1996.3  587.48 <2e-16 ***
## time_in_hospital:A1Cresult       3     601   200.4   58.97 <2e-16 ***
## Residuals                  1315914 4471552     3.4                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
glu1 <- aov(total_patient ~ time_in_hospital * max_glu_serum, data = sub_diaData)
print(summary(glu1))
##                                     Df  Sum Sq Mean Sq F value Pr(>F)    
## time_in_hospital                     1     949     949  279.87 <2e-16 ***
## max_glu_serum                        3   15285    5095 1502.39 <2e-16 ***
## time_in_hospital:max_glu_serum       3     350     117   34.41 <2e-16 ***
## Residuals                      1315914 4462507       3                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Create the regression model 02.
res2 <- aov(total_patient ~ time_in_hospital + A1Cresult, data = sub_diaData)
print(summary(res2))
##                       Df  Sum Sq Mean Sq F value Pr(>F)    
## time_in_hospital       1     949   949.1   279.3 <2e-16 ***
## A1Cresult              3    5989  1996.3   587.4 <2e-16 ***
## Residuals        1315917 4472153     3.4                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
glu2 <- aov(total_patient ~ time_in_hospital + max_glu_serum, data = sub_diaData)
print(summary(glu2))
##                       Df  Sum Sq Mean Sq F value Pr(>F)    
## time_in_hospital       1     949     949   279.9 <2e-16 ***
## max_glu_serum          3   15285    5095  1502.3 <2e-16 ***
## Residuals        1315917 4462857       3                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Compare the two models 01 & 02
print(anova(res1,res2))
## Analysis of Variance Table
## 
## Model 1: total_patient ~ time_in_hospital * A1Cresult
## Model 2: total_patient ~ time_in_hospital + A1Cresult
##    Res.Df     RSS Df Sum of Sq      F    Pr(>F)    
## 1 1315914 4471552                                  
## 2 1315917 4472153 -3   -601.12 58.967 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
print(anova(glu1,glu2))
## Analysis of Variance Table
## 
## Model 1: total_patient ~ time_in_hospital * max_glu_serum
## Model 2: total_patient ~ time_in_hospital + max_glu_serum
##    Res.Df     RSS Df Sum of Sq      F    Pr(>F)    
## 1 1315914 4462507                                  
## 2 1315917 4462857 -3   -350.09 34.411 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Results: Model 02 demonstrated significant influence of HbA1c (“A1Cresult”) and max_glu_serum to time in hospital of the patient.

4.3.2. R-Chi Square Test

This is to determine if A1Cresult (HbA1c) had a significant correlation with other variables in the dataset.

test_c1 <- data.frame(sub_diaData$gender, sub_diaData$A1Cresult)
dt1 = table(sub_diaData$gender, sub_diaData$A1Cresult)
print(dt1)
##                  
##                       >7     >8   None   Norm
##   Female           29486  60421 568744  39767
##   Male             27991  63756 491556  34132
##   Unknown/Invalid      0      0     69      0
print(chisq.test(dt1))
## Warning in chisq.test(dt1): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  dt1
## X-squared = 1214.5, df = 6, p-value < 2.2e-16
test_c1b <- data.frame(sub_diaData$gender, sub_diaData$max_glu_serum)
dt1b = table(sub_diaData$gender, sub_diaData$max_glu_serum)
print(dt1b)
##                  
##                     >200   >300   None   Norm
##   Female            8625   6716 668656  14421
##   Male              6348   6256 592273  12558
##   Unknown/Invalid      0      0     69      0
print(chisq.test(dt1b))
## Warning in chisq.test(dt1b): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  dt1b
## X-squared = 137.76, df = 6, p-value < 2.2e-16
test_c2 <- data.frame(sub_diaData$race, sub_diaData$A1Cresult)
dt2 = table(sub_diaData$race, sub_diaData$A1Cresult)
print(dt2)
##                  
##                       >7     >8   None   Norm
##   AfricanAmerican   8993  29164 195339  16054
##   Asian              690   1127   7222    575
##   Caucasian        43631  82593 790809  52141
##   Hispanic          1472   4071  20079   2001
##   No record         1495   4738  29992   1679
##   Other             1196   2484  16928   1449
print(chisq.test(dt2))
## 
##  Pearson's Chi-squared test
## 
## data:  dt2
## X-squared = 5449.9, df = 15, p-value < 2.2e-16
test_c2b <- data.frame(sub_diaData$race, sub_diaData$max_glu_serum)
dt2b = table(sub_diaData$race, sub_diaData$max_glu_serum)
print(dt2b)
##                  
##                     >200   >300   None   Norm
##   AfricanAmerican   1219   1334 244651   2346
##   Asian              161    138   9154    161
##   Caucasian        12259  10511 924002  22402
##   Hispanic           621    644  25254   1104
##   No record          299    115  37053    437
##   Other              414    230  20884    529
print(chisq.test(dt2b))
## 
##  Pearson's Chi-squared test
## 
## data:  dt2b
## X-squared = 5577.4, df = 15, p-value < 2.2e-16
test_c3 <- data.frame(sub_diaData$age, sub_diaData$A1Cresult)
dt3 = table(sub_diaData$age, sub_diaData$A1Cresult)
print(dt3)
##           
##                >7     >8   None   Norm
##   [0-10)       23   1840   1265    207
##   [10-20)     184   4853   5175    713
##   [20-30)     437   4991  15341   1334
##   [30-40)    1357   9269  37145   3151
##   [40-50)    5382  19849  97474   8947
##   [50-60)    9798  28704 183264  13432
##   [60-70)   13639  25116 240649  14881
##   [70-80)   15410  18998 276598  17273
##   [80-90)    9614   9246 174018  12167
##   [90-100)   1633   1311  29440   1794
print(chisq.test(dt3))
## 
##  Pearson's Chi-squared test
## 
## data:  dt3
## X-squared = 54281, df = 27, p-value < 2.2e-16
test_c3b <- data.frame(sub_diaData$age, sub_diaData$max_glu_serum)
dt3b = table(sub_diaData$age, sub_diaData$max_glu_serum)
print(dt3b)
##           
##              >200   >300   None   Norm
##   [0-10)        0      0   3335      0
##   [10-20)      46    161  10718      0
##   [20-30)     253    575  21160    115
##   [30-40)     299    736  49105    782
##   [40-50)    1403   2116 125948   2185
##   [50-60)    2231   2116 227148   3703
##   [60-70)    2553   2001 284510   5221
##   [70-80)    4025   2967 313352   7935
##   [80-90)    3519   1909 193752   5865
##   [90-100)    644    391  31970   1173
print(chisq.test(dt3b))
## 
##  Pearson's Chi-squared test
## 
## data:  dt3b
## X-squared = 5268.2, df = 27, p-value < 2.2e-16
test_c4 <- data.frame(sub_diaData$change, sub_diaData$A1Cresult)
dt4 = table(sub_diaData$change, sub_diaData$A1Cresult)
print(dt4)
##     
##          >7     >8   None   Norm
##   Ch  28014  79833 447143  31303
##   No  29463  44344 613226  42596
print(chisq.test(dt4))
## 
##  Pearson's Chi-squared test
## 
## data:  dt4
## X-squared = 22573, df = 3, p-value < 2.2e-16
test_c4b <- data.frame(sub_diaData$change, sub_diaData$max_glu_serum)
dt4b = table(sub_diaData$change, sub_diaData$max_glu_serum)
print(dt4b)
##     
##        >200   >300   None   Norm
##   Ch   6647   8349 563937   7360
##   No   8326   4623 697061  19619
print(chisq.test(dt4b))
## 
##  Pearson's Chi-squared test
## 
## data:  dt4b
## X-squared = 5333.3, df = 3, p-value < 2.2e-16
test_c5 <- data.frame(sub_diaData$diabetesMed, sub_diaData$A1Cresult)
dt5 = table(sub_diaData$diabetesMed, sub_diaData$A1Cresult)
print(dt5)
##      
##           >7     >8   None   Norm
##   No   12121  12167 278599  20424
##   Yes  45356 112010 781770  53475
print(chisq.test(dt5))
## 
##  Pearson's Chi-squared test
## 
## data:  dt5
## X-squared = 17033, df = 3, p-value < 2.2e-16
test_c5b <- data.frame(sub_diaData$diabetesMed, sub_diaData$max_glu_serum)
dt5b = table(sub_diaData$diabetesMed, sub_diaData$max_glu_serum)
print(dt5b)
##      
##         >200   >300   None   Norm
##   No    3243   1909 306912  11247
##   Yes  11730  11063 954086  15732
print(chisq.test(dt5b))
## 
##  Pearson's Chi-squared test
## 
## data:  dt5b
## X-squared = 5050.2, df = 3, p-value < 2.2e-16
test_c6 <- data.frame(sub_diaData$readmitted, sub_diaData$A1Cresult)
dt6 = table(sub_diaData$readmitted, sub_diaData$A1Cresult)
print(dt6)
##      
##           >7     >8   None   Norm
##   <30   4968  10396  96094   6279
##   >30  16399  35581 319861  19159
##   NO   36110  78200 644414  48461
print(chisq.test(dt6))
## 
##  Pearson's Chi-squared test
## 
## data:  dt6
## X-squared = 934.6, df = 6, p-value < 2.2e-16
test_c6b <- data.frame(sub_diaData$readmitted, sub_diaData$max_glu_serum)
dt6b = table(sub_diaData$readmitted, sub_diaData$max_glu_serum)
print(dt6b)
##      
##         >200   >300   None   Norm
##   <30   1472   1518 112033   2714
##   >30   5267   5175 372761   7797
##   NO    8234   6279 776204  16468
print(chisq.test(dt6b))
## 
##  Pearson's Chi-squared test
## 
## data:  dt6b
## X-squared = 1246.8, df = 6, p-value < 2.2e-16

Results:

A1Cresult (HbA1c) and max_glu_serum (Glucose serum) mostly affects to other variables, specific to: - Gender: Female was mostly influenced by 7<HbA1c<8 & Glucose serum>200, while male become impacted by HbA1c>8 - Race: Caucasian was the most impacted by HbA1c>8 & Glucose serum>200 while Asian was the group as the least influenced among 05 groups (Caucasian, African-American, Asian, Hispanic, and Others). - Age: Age group of [50-60] was the most impacted by HbA1c>8 while group of [70-80] got significantly impacted by Glucose serum>200. In the meanwhile, group of [0-10] was the least impacted against the other. - Medication Change: Change medication was responsible for greater impact than no change medication in the treatment. It happened when patients having HbA1c>8 & Glucose serum>200. This is especially true when the patient is taking diabetes medication rather than no taking such medication. - Readmission: Patients had to readmit within 30 days thanks to HbA1c>8 & Glucose serum>300, while with HbA1c>8 & Glucose serum>200, patients would be prone to readmit after 01 month.

4.3.3. Random Forest (Classification)

Create cross-validation bootstraps

# Create cross-validation bootstraps.
dia_train %>%
  count(readmitted)
##   readmitted      n
## 1        <30  20331
## 2        >30  67506
## 3         NO 137164
set.seed(1234)
dia_folds <- dia_train %>%
  mutate(readmitted = factor(readmitted)) %>%
  bootstraps(3)

dia_folds
## # Bootstrap sampling 
## # A tibble: 3 x 2
##   splits               id        
##   <list>               <chr>     
## 1 <split [225K/82.8K]> Bootstrap1
## 2 <split [225K/82.8K]> Bootstrap2
## 3 <split [225K/82.9K]> Bootstrap3

Let’s create a random forest model and set up a model workflow with the model and a formula pre-processor.

rf_spec <- rand_forest(trees = 200) %>%
  set_mode("classification") %>%
  set_engine("ranger")

dia_wf <- workflow() %>%
  add_formula(readmitted ~.) %>%
  add_model(rf_spec)

dia_wf
## == Workflow ====================================================================
## Preprocessor: Formula
## Model: rand_forest()
## 
## -- Preprocessor ----------------------------------------------------------------
## readmitted ~ .
## 
## -- Model -----------------------------------------------------------------------
## Random Forest Model Specification (classification)
## 
## Main Arguments:
##   trees = 200
## 
## Computational engine: ranger

Let’s fit the random forest model to the bootstrap re-samples.

library(ranger)
## Warning: package 'ranger' was built under R version 4.0.3
doParallel::registerDoParallel()
dia_rs <- fit_resamples(
  dia_wf,
  resamples = dia_folds,
  control = control_resamples(save_pred = TRUE)
)

dia_rs
## # Resampling results
## # Bootstrap sampling 
## # A tibble: 3 x 5
##   splits             id        .metrics       .notes         .predictions       
##   <list>             <chr>     <list>         <list>         <list>             
## 1 <split [225K/82.8~ Bootstra~ <tibble [2 x ~ <tibble [0 x ~ <tibble [82,778 x ~
## 2 <split [225K/82.8~ Bootstra~ <tibble [2 x ~ <tibble [0 x ~ <tibble [82,762 x ~
## 3 <split [225K/82.9~ Bootstra~ <tibble [2 x ~ <tibble [0 x ~ <tibble [82,872 x ~

Model Evaluation

collect_metrics(dia_rs)
## # A tibble: 2 x 6
##   .metric  .estimator  mean     n  std_err .config             
##   <chr>    <chr>      <dbl> <int>    <dbl> <chr>               
## 1 accuracy multiclass 0.805     3 0.00198  Preprocessor1_Model1
## 2 roc_auc  hand_till  0.967     3 0.000337 Preprocessor1_Model1
dia_fit <- last_fit(dia_wf, dia_split)
collect_metrics(dia_fit)
## # A tibble: 2 x 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy multiclass     0.814 Preprocessor1_Model1
## 2 roc_auc  hand_till      0.989 Preprocessor1_Model1
dia_rs %>%
  collect_predictions() %>%
  group_by(id) %>%
  ppv(readmitted, .pred_class)
## # A tibble: 3 x 4
##   id         .metric .estimator .estimate
##   <chr>      <chr>   <chr>          <dbl>
## 1 Bootstrap1 ppv     macro          0.908
## 2 Bootstrap2 ppv     macro          0.911
## 3 Bootstrap3 ppv     macro          0.911

Compute ROC curves for each class

dia_rs %>%
  collect_predictions() %>%
  group_by(id) %>%
  roc_curve(readmitted, `.pred_<30`:.pred_NO ) %>%
  ggplot(aes(1 - specificity, sensitivity, color = id)) +
  geom_abline(lty = 2, color = "gray80", size = 1.5) +
  geom_path(show.legend = FALSE, alpha = 0.6, size = 1.2) +
  facet_wrap(~.level, ncol = 5) +
  coord_equal()

Figure: Plots describe ROC curve from each class of readmission

Observation: We have an ROC curve for each class and each re-sample in this plot. Notice that the points of class were easy for the model to identify.

dia_rs %>%
  collect_predictions() %>%
  filter(.pred_class != readmitted) %>%
  conf_mat(readmitted, .pred_class) %>%
  autoplot(type = "heatmap")

Figure: Confusion Matrix of prediction and truth observations

Observation: The classes in readmission was confused with many of the other classes, whereas class of ‘over-30-day’ readmission was mostly confused with class of ‘No’ readmission.

# Save model
dia_wf_model <- dia_fit$.workflow[[1]]

# predict on test set
predict(dia_wf_model, dia_test[74999, ])
## # A tibble: 1 x 1
##   .pred_class
##   <fct>      
## 1 NO

Out-of-sample-error

library(caret)
## Warning: package 'caret' was built under R version 4.0.3
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:yardstick':
## 
##     precision, recall, sensitivity, specificity
## The following object is masked from 'package:survival':
## 
##     cluster
## The following object is masked from 'package:purrr':
## 
##     lift
predict_rf <- predict(dia_wf_model, dia_test)

confusionMatrix(dia_test$readmitted, predict_rf$.pred_class)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   <30   >30    NO
##        <30  2209    83  4518
##        >30     0 13179  9288
##        NO      0    41 45681
## 
## Overall Statistics
##                                          
##                Accuracy : 0.8143         
##                  95% CI : (0.8115, 0.817)
##     No Information Rate : 0.7932         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.5968         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
## 
## Statistics by Class:
## 
##                      Class: <30 Class: >30 Class: NO
## Sensitivity             1.00000     0.9907    0.7679
## Specificity             0.93679     0.8495    0.9974
## Pos Pred Value          0.32438     0.5866    0.9991
## Neg Pred Value          1.00000     0.9976    0.5284
## Prevalence              0.02945     0.1774    0.7932
## Detection Rate          0.02945     0.1757    0.6091
## Detection Prevalence    0.09080     0.2996    0.6096
## Balanced Accuracy       0.96840     0.9201    0.8826
# Out-of-sample-error in validation set
OOSE <- 1 - as.numeric(confusionMatrix(dia_test$readmitted, predict_rf$.pred_class)$overall[1])

OOSE
## [1] 0.1857358

Predict class of readmission in 20 extracted test cases (out-of-training data set)

test_case <- fin_dia %>%
  slice(300001:300020)

predict(dia_wf_model, test_case)
## # A tibble: 20 x 1
##    .pred_class
##    <fct>      
##  1 NO         
##  2 NO         
##  3 >30        
##  4 NO         
##  5 NO         
##  6 NO         
##  7 NO         
##  8 NO         
##  9 NO         
## 10 NO         
## 11 NO         
## 12 NO         
## 13 >30        
## 14 NO         
## 15 NO         
## 16 NO         
## 17 NO         
## 18 NO         
## 19 NO         
## 20 >30

4.3.4. Decision Tree

## Warning: package 'rlang' was built under R version 4.0.3
## 
## Attaching package: 'rlang'
## The following object is masked from 'package:wrapr':
## 
##     :=
## The following objects are masked from 'package:purrr':
## 
##     %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
##     flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
##     splice
## Warning: package 'rpart' was built under R version 4.0.3
## 
## Attaching package: 'rpart'
## The following object is masked from 'package:dials':
## 
##     prune
## Warning: package 'rpart.plot' was built under R version 4.0.3
churn_tree = rpart(readmitted ~ ., 
              data = dia_train )

rpart.plot(churn_tree)

churn_tree_tst_pred = predict(churn_tree, dia_test, type = "class")
table(predicted = churn_tree_tst_pred, actual = dia_test$readmitted)
##          actual
## predicted   <30   >30    NO
##       <30     0     0     0
##       >30   838  1997   862
##       NO   5972 20470 44860
# funct to predict accuracy of Classification tree model
calc_acc = function(actual, predicted) {
  mean(actual == predicted)
}

# Accuracy
(tree_tst_acc = calc_acc(predicted = churn_tree_tst_pred, actual = dia_test$readmitted))
## [1] 0.6247683
confusionMatrix(churn_tree_tst_pred, dia_test$readmitted)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   <30   >30    NO
##        <30     0     0     0
##        >30   838  1997   862
##        NO   5972 20470 44860
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6248          
##                  95% CI : (0.6213, 0.6282)
##     No Information Rate : 0.6096          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.075           
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: <30 Class: >30 Class: NO
## Sensitivity              0.0000    0.08889   0.98115
## Specificity              1.0000    0.96764   0.09683
## Pos Pred Value              NaN    0.54017   0.62915
## Neg Pred Value           0.9092    0.71291   0.76684
## Prevalence               0.0908    0.29956   0.60963
## Detection Rate           0.0000    0.02663   0.59814
## Detection Prevalence     0.0000    0.04929   0.95071
## Balanced Accuracy        0.5000    0.52826   0.53899
# Out-of-sample-error in test set
OOSE <- 1 - as.numeric(confusionMatrix(dia_test$readmitted, churn_tree_tst_pred)$overall[1])
OOSE
## [1] 0.3752317