URL: [https://www.hindawi.com/journals/bmri/2014/781670/tab1/]
## Warning: package 'tidyverse' was built under R version 4.0.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v dplyr 1.0.2
## v tibble 3.0.4 v stringr 1.4.0
## v tidyr 1.1.2 v forcats 0.5.1
## v purrr 0.3.4
## Warning: package 'ggplot2' was built under R version 4.0.3
## Warning: package 'tibble' was built under R version 4.0.3
## Warning: package 'stringr' was built under R version 4.0.3
## Warning: package 'forcats' was built under R version 4.0.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
diaData = read_csv("C:/Users/Anhuynh/Desktop/Data Science Project/Hospital Readmission/dataset_diabetes/diabetic_data.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## encounter_id = col_double(),
## patient_nbr = col_double(),
## admission_type_id = col_double(),
## discharge_disposition_id = col_double(),
## admission_source_id = col_double(),
## time_in_hospital = col_double(),
## num_lab_procedures = col_double(),
## num_procedures = col_double(),
## num_medications = col_double(),
## number_outpatient = col_double(),
## number_emergency = col_double(),
## number_inpatient = col_double(),
## number_diagnoses = col_double()
## )
## See spec(...) for full column specifications.
# Calculate percentage of missing values in data set for choosing target variances
num_na <- function(x, ...) {
y = table(x)
na <- y[names(y) == "?"]
}
encounter_id <- paste(round((num_na(diaData$encounter_id)/ 101766) * 100, 2), "%")
patient_nbr <- paste(round((num_na(diaData$patient_nbr)/ 101766) * 100, 2), "%")
race <- paste(round((num_na(diaData$race)/ 101766) * 100, 2), "%")
age <- paste(round((num_na(diaData$age)/ 101766) * 100, 2), "%")
gender <- paste(round((num_na(diaData$gender)/ 101766) * 100, 2), "%")
weight <- paste(round((num_na(diaData$weight)/ 101766) * 100, 2), "%")
admission_type_id <- paste(round((num_na(diaData$admission_type_id)/ 101766) * 100, 2), "%")
discharge_disposition_id <- paste(round((num_na(diaData$discharge_disposition_id)/ 101766) * 100, 2), "%")
admission_source_id <- paste(round((num_na(diaData$admission_source_id)/ 101766) * 100, 2), "%")
payer_code <- paste(round((num_na(diaData$payer_code)/ 101766) * 100, 2), "%")
num_lab_procedures <- paste(round((num_na(diaData$num_lab_procedures)/ 101766) * 100, 2), "%")
num_procedures <- paste(round((num_na(diaData$num_procedures)/ 101766) * 100, 2), "%")
time_in_hospital <- paste(round((num_na(diaData$time_in_hospital)/ 101766) * 100, 2), "%")
medical_spec <- paste(round((num_na(diaData$medical_specialty)/ 101766) * 100, 2), "%")
num_medications <- paste(round((num_na(diaData$num_medications)/ 101766) * 100, 2), "%")
num_outpatient <- paste(round((num_na(diaData$number_outpatient)/ 101766) * 100, 2), "%")
num_inpatient <- paste(round((num_na(diaData$number_inpatient)/ 101766) * 100, 2), "%")
num_emergency <- paste(round((num_na(diaData$number_emergency)/ 101766) * 100, 2), "%")
num_diagnoses <- paste(round((num_na(diaData$number_diagnoses)/ 101766) * 100, 2), "%")
diag_1 <- paste(round((num_na(diaData$diag_1)/ 101766) * 100, 2), "%")
diag_2 <- paste(round((num_na(diaData$diag_2)/ 101766) * 100, 2), "%")
diag_3 <- paste(round((num_na(diaData$diag_3)/ 101766) * 100, 2), "%")
glu_serum <- paste(round((num_na(diaData$max_glu_serum)/ 101766) * 100, 2), "%")
A1C <- paste(round((num_na(diaData$A1Cresult)/ 101766) * 100, 2), "%")
insulin <- paste(round((num_na(diaData$insulin)/ 101766) * 100, 2), "%")
metformin <- paste(round((num_na(diaData$metformin)/ 101766) * 100, 2), "%")
repaglinide <- paste(round((num_na(diaData$repaglinide)/ 101766) * 100, 2), "%")
nateglinide <- paste(round((num_na(diaData$nateglinide)/ 101766) * 100, 2), "%")
chlorpropamide <- paste(round((num_na(diaData$chlorpropamide)/ 101766) * 100, 2), "%")
glimepiride <- paste(round((num_na(diaData$glimepiride)/ 101766) * 100, 2), "%")
acetohexamide <- paste(round((num_na(diaData$acetohexamide)/ 101766) * 100, 2), "%")
glipizide <- paste(round((num_na(diaData$glipizide)/ 101766) * 100, 2), "%")
glyburide <- paste(round((num_na(diaData$glyburide)/ 101766) * 100, 2), "%")
tolbutamide <- paste(round((num_na(diaData$tolbutamide)/ 101766) * 100, 2), "%")
examide <- paste(round((num_na(diaData$examide)/ 101766) * 100, 2), "%")
citoglipton <- paste(round((num_na(diaData$citoglipton)/ 101766) * 100, 2), "%")
changeMed <- paste(round((num_na(diaData$change)/ 101766) * 100, 2), "%")
diabetesMed <- paste(round((num_na(diaData$diabetesMed)/ 101766) * 100, 2), "%")
readmitted <- paste(round((num_na(diaData$readmitted)/ 101766) * 100, 2), "%")
NA_data <- rbind(encounter_id, patient_nbr, race, age, gender,
weight,admission_type_id,discharge_disposition_id,
admission_source_id,payer_code, num_lab_procedures,num_procedures,
time_in_hospital, medical_spec, num_medications, num_outpatient,
num_inpatient, num_emergency,
num_diagnoses,diag_1, diag_2,diag_3, glu_serum, A1C, insulin, metformin,
repaglinide ,
nateglinide, chlorpropamide, glimepiride, acetohexamide, glipizide, glyburide,
tolbutamide,
examide, citoglipton, changeMed, diabetesMed, readmitted )
colnames(NA_data) <- c("Missing_Value_Percentage")
NA_data
## Missing_Value_Percentage
## encounter_id " %"
## patient_nbr " %"
## race "2.23 %"
## age " %"
## gender " %"
## weight "96.86 %"
## admission_type_id " %"
## discharge_disposition_id " %"
## admission_source_id " %"
## payer_code "39.56 %"
## num_lab_procedures " %"
## num_procedures " %"
## time_in_hospital " %"
## medical_spec "49.08 %"
## num_medications " %"
## num_outpatient " %"
## num_inpatient " %"
## num_emergency " %"
## num_diagnoses " %"
## diag_1 "0.02 %"
## diag_2 "0.35 %"
## diag_3 "1.4 %"
## glu_serum " %"
## A1C " %"
## insulin " %"
## metformin " %"
## repaglinide " %"
## nateglinide " %"
## chlorpropamide " %"
## glimepiride " %"
## acetohexamide " %"
## glipizide " %"
## glyburide " %"
## tolbutamide " %"
## examide " %"
## citoglipton " %"
## changeMed " %"
## diabetesMed " %"
## readmitted " %"
This is information extracted from IDs_mapping.csv of initial data link provided earlier.
# Identify records of patients who are dead/expired/in hospice
id <- table(diaData$discharge_disposition_id)
id_del <- id[-c(11, 13 ,14, 19, 20)]
diaData <- subset(diaData, diaData$discharge_disposition_id %in% names(id_del))
# Remove unused variance (Weight) from initial data set
diaData <- diaData[-c(6)]
# Replace "?" from data set with "No Record" notation
sub_dt1 <- replace(diaData$race, diaData$race %in% c("?"), c("No record") )
sub_dt2 <- replace(diaData$payer_code, diaData$payer_code %in% c("?"), c("No record"))
sub_dt3 <- replace(diaData$medical_specialty, diaData$medical_specialty %in% c("?"), c("No record"))
sub_dt4 <- replace(diaData$diag_1, diaData$diag_1 %in% c("?"), c("No record"))
sub_dt5 <- replace(diaData$diag_2, diaData$diag_2 %in% c("?"), c("No record"))
sub_dt6 <- replace(diaData$diag_3, diaData$diag_3 %in% c("?"), c("No record"))
diaData <- mutate(diaData, race = sub_dt1,
payer_code = sub_dt2,
medical_specialty = sub_dt3,
diag_1 = sub_dt4,
diag_2 = sub_dt5,
diag_3 = sub_dt6 )
# Race
count_race <- table(diaData$race)
df_race <- data.frame(race = c("AfricanAmerican", "Asian", "Caucasian", "Hispanic", "No record", "Other"),
count = c(18772, 628, 74220, 2017, 2234, 1472))
ggplot(data=df_race, aes(x = race, y = count, fill = race)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=df_race$count, label= df_race$count), vjust=1, color="black", size=3) +
labs(title="Count on Race")
## Warning: Use of `df_race$count` is discouraged. Use `count` instead.
## Warning: Use of `df_race$count` is discouraged. Use `count` instead.
# Gender
count_gender <- table(diaData$gender)
df_gender <- data.frame(gender = c("Female", "Male"), count = c(53454, 45886))
ggplot(data=df_gender, aes(x = gender, y = count, fill = gender)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=df_gender$count, label= df_gender$count), vjust=1, color="black", size=3) +
labs(title="Count on Gender")
## Warning: Use of `df_gender$count` is discouraged. Use `count` instead.
## Warning: Use of `df_gender$count` is discouraged. Use `count` instead.
# Age
count_age <- table(diaData$age)
df_age <- data.frame(age = c("[0-10)", "[10-20)", "[20-30)", "[30-40)", "[40-50)", "[50-60)", "[60-70)", "[70-80)", "[80-90)", "[90-100)"), count = c(160, 690, 1649, 3764, 9607, 17060, 22059, 25331, 16434, 2589))
ggplot(data=df_age, aes(x = age, y = count, fill = age)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=df_age$count, label= df_age$count), vjust=1, color="black", size=3) +
labs(title="Count on Age")
## Warning: Use of `df_age$count` is discouraged. Use `count` instead.
## Warning: Use of `df_age$count` is discouraged. Use `count` instead.
# Admission Type
count_ad <- table(diaData$admission_type_id)
df_ad <- data.frame(admission = c("Emergency", "Urgent", "Elective", "Newborn", "Not Available", "NULL", "Trauma Center", "Not Mapped"), count = c(52371, 18132, 18668, 10, 4617, 5207, 18, 320))
ggplot(data=df_ad, aes(x = admission, y = count, fill = admission)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=df_ad$count, label= df_ad$count), vjust=1, color="black", size=3) +
labs(title="Count on Admission Type")
## Warning: Use of `df_ad$count` is discouraged. Use `count` instead.
## Warning: Use of `df_ad$count` is discouraged. Use `count` instead.
# Readmitted (day)
count_re <- table(diaData$readmitted)
df_re <- data.frame(readmission = c("<30", ">30", "No Record"), count = c(11357, 35545, 54864))
ggplot(data=df_re, aes(x = readmission, y = count, fill = readmission)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=df_re$count, label= df_re$count), vjust=1, color="black", size=3) +
labs(title="Count on Readmitted day")
## Warning: Use of `df_re$count` is discouraged. Use `count` instead.
## Warning: Use of `df_re$count` is discouraged. Use `count` instead.
# Distribution of the variables counts, or how many patients fall into each bucket of measurements.
hist(diaData$time_in_hospital, col = "green")
rug(diaData$time_in_hospital)
abline(v = median(diaData$time_in_hospital), col = "magenta", lwd = 4)
hist(diaData$number_diagnoses, col = "blue")
rug(diaData$number_diagnoses)
abline(v = median(diaData$number_diagnoses), col = "magenta", lwd = 4)
hist(diaData$num_lab_procedures, col = "gold")
rug(diaData$num_lab_procedures)
abline(v = median(diaData$num_lab_procedures), col = "magenta", lwd = 4)
hist(diaData$num_medications, col = "maroon")
rug(diaData$num_medications)
abline(v = median(diaData$num_medications), col = "magenta", lwd = 4)
#Correlation number_emergency ~ number_diagnoses | A1Cresult / Insulin / Glucose
g <- ggplot(diaData, aes(number_diagnoses, number_emergency))
g + geom_smooth(method = "lm", col = "magenta") + facet_grid( .~A1Cresult) + geom_point(color = "blue",size=1.75) + labs(title = "Number of emergency correlates to A1Cresult test result")
## `geom_smooth()` using formula 'y ~ x'
g + geom_smooth(method = "lm", col ="magenta") + facet_grid( .~insulin) + geom_point(color = "darkgreen",size=1.75) + labs(title = "Number of emergency correlates to Insulin test result")
## `geom_smooth()` using formula 'y ~ x'
g + geom_smooth(method = "lm", col = "magenta") + facet_grid( .~max_glu_serum) + geom_point(color = "gold",size=1.75) + labs(title = "Number of emergency correlates to Glucose serum test result")
## `geom_smooth()` using formula 'y ~ x'
# Comparison of number_emergency ~ number_diagnoses |A1Cresult vs. Glucose
par(mfrow = c(1,2), mar = c(4,4,2,1))
qplot(number_diagnoses, number_emergency, data = diaData, color = A1Cresult, facets = .~A1Cresult) + geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
qplot(number_diagnoses, number_emergency, data = diaData, color = max_glu_serum, facets = .~max_glu_serum) + geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
# Comparison of number_emergency ~ A1Cresult vs. Glucose
qplot(number_inpatient, number_outpatient, data = diaData, color = A1Cresult, facets = .~A1Cresult) + geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
qplot(number_inpatient, number_outpatient, data = diaData, color = max_glu_serum, facets = .~max_glu_serum) + geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
# Comparison of time_in_hospital ~ number of patient | A1Cresult
qplot(number_inpatient, time_in_hospital, data = diaData, color = A1Cresult, facets = .~A1Cresult) + geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
qplot(number_outpatient, time_in_hospital, data = diaData, color = A1Cresult, facets = .~A1Cresult) + geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
# Comparison of time_in_hospital ~ number of patients | Glucose test result
qplot(number_inpatient, time_in_hospital, data = diaData, color = max_glu_serum, facets = .~max_glu_serum) + geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
qplot(number_outpatient, time_in_hospital, data = diaData, color = max_glu_serum, facets = .~max_glu_serum) + geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
# Comparison of time_in_hospital ~ number of patients | readmitted days
qplot(number_inpatient, time_in_hospital, data = diaData, color = readmitted, facets = .~readmitted) + geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
qplot(number_outpatient, time_in_hospital, data = diaData, color = readmitted, facets = .~readmitted) + geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
# Correlation of number_emergency ~ medical_specialty | discharge_disposition_id | admission_source_id
# Create subset for sorting data
diaData_ord = diaData[order(diaData$number_emergency, decreasing = TRUE), ]
sort_med <- diaData_ord[c(1:length(diaData)), ]
# Plot data with graph
par(mfrow = c(2,2), mar = c(5,4,2,1))
ggplot(data=sort_med, aes(x = medical_specialty, y=number_emergency, fill=medical_specialty)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=sort_med$number_emergency, label=""), vjust=1, color="black", size=3) +
labs(title="Topmost Medical Specialty impact number of emergency") +
theme(legend.title = element_blank(), axis.text.x=element_text(angle=45,hjust=1,vjust=1))
## Warning: Use of `sort_med$number_emergency` is discouraged. Use
## `number_emergency` instead.
ggplot(data=sort_med, aes(x = discharge_disposition_id, y=number_emergency, fill=discharge_disposition_id)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=sort_med$number_emergency, label=""), vjust=1, color="black", size=3) +
labs(title="Topmost Discharge Type impact Number of Emergency") +
theme(legend.title = element_blank(), axis.text.x=element_text(angle=45,hjust=1,vjust=1))
## Warning: Use of `sort_med$number_emergency` is discouraged. Use
## `number_emergency` instead.
ggplot(data=sort_med, aes(x = admission_source_id, y=number_emergency, fill=admission_source_id)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=sort_med$number_emergency, label=""), vjust=1, color="black", size=3) +
labs(title="Topmost Admission Source impact Number of Emergency") +
theme(legend.title = element_blank(), axis.text.x=element_text(angle=45,hjust=1,vjust=1))
## Warning: Use of `sort_med$number_emergency` is discouraged. Use
## `number_emergency` instead.
# Correlation of number_emergency ~ readmitted
par(mfrow = c(2,2), mar = c(5,4,2,1))
ggplot(data=diaData, aes(x = readmitted, y=number_emergency, fill=readmitted)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=diaData$number_emergency, label=""), vjust=1, color="black", size=3) +
labs(title="Number of emergency vary to readmitted day") +
theme(legend.title = element_blank(), axis.text.x=element_text(angle=45,hjust=1,vjust=1))
## Warning: Use of `diaData$number_emergency` is discouraged. Use
## `number_emergency` instead.
# Correlation of number_emergency ~ change
par(mfrow = c(2,2), mar = c(5,4,2,1))
ggplot(data=diaData, aes(x=change, y=number_emergency, fill=change)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=diaData$number_emergency, label=""), vjust=1, color="black", size=3) +
labs(title="Change medication impacts number of emergency") +
theme(legend.title = element_blank(), axis.text.x=element_text(angle=45,hjust=1,vjust=1) )
## Warning: Use of `diaData$number_emergency` is discouraged. Use
## `number_emergency` instead.
library(tidyr)
library(dplyr)
library(ggplot2)
sort_med %>% select(medical_specialty, number_inpatient, number_outpatient) %>%
pivot_longer(., cols = c(number_inpatient, number_outpatient), names_to = "Type_of_patient", values_to = "Number_of_patient") %>%
ggplot(aes(x = Type_of_patient, y = Number_of_patient, fill = medical_specialty)) +
geom_boxplot() +
labs(title="Topmost Medical Specialty vary to Type of Patients")
diaData %>% select(change, number_inpatient, number_outpatient) %>%
pivot_longer(., cols = c(number_inpatient, number_outpatient), names_to = "Change_medication", values_to = "Number_of_patient") %>%
ggplot(aes(x = Change_medication, y = Number_of_patient, fill = change)) +
geom_boxplot() +
labs(title="Change Medication vary to Type of Patients")
diaData %>% select(readmitted, number_inpatient, number_outpatient) %>%
pivot_longer(., cols = c(number_inpatient, number_outpatient), names_to = "Readmitted", values_to = "Number_of_patient") %>%
ggplot(aes(x = Readmitted, y = Number_of_patient, fill = readmitted)) +
geom_boxplot() +
labs(title="Readmitted days vary to Type of Patients")
## Warning: package 'corrplot' was built under R version 4.0.3
## corrplot 0.84 loaded
## Warning: package 'Hmisc' was built under R version 4.0.3
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Warning: package 'Formula' was built under R version 4.0.3
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
## Warning: package 'ggcorrplot' was built under R version 4.0.3
diadt <- cor(diaData[ ,c("time_in_hospital", "num_lab_procedures", "num_procedures",
"num_medications", "number_outpatient", "number_emergency",
"number_inpatient", "number_diagnoses")])
cormat <- diadt
ggcorrplot::ggcorrplot(cormat, title = "Correlation of Numeric Variables")
# devtools::install_github("laresbernardo/lares")
library(lares)
##
## Attaching package: 'lares'
## The following object is masked from 'package:Hmisc':
##
## impute
dia <- subset(diaData, select = c("time_in_hospital", "num_lab_procedures",
"num_procedures","num_medications",
"number_outpatient",
"number_emergency", "number_inpatient",
"number_diagnoses"))
corr_cross(dia, # name of dataset
max_pvalue = 0.05, # display only significant correlations (at 5% level)
top = 10 # display top 10 couples of variables (by correlation coefficient)
)
## Returning only the top 10. You may override with the 'top' argument
## Warning in theme_lares(legend = "top"): Font 'Arial Narrow' is not installed,
## has other name, or can't be found
# Create "visit" feature
diaData_vis <- subset(diaData, number_outpatient > 0 & number_inpatient > 0 & number_emergency > 0)
diaData_vis$CAT <- 1
diaData_vis$CATNAME <- "hospital_visit"
diaData_vis$total_patient <- diaData_vis$number_outpatient + diaData_vis$number_inpatient + diaData_vis$number_emergency
diaData_novis <- subset(diaData, number_outpatient == 0 & number_inpatient == 0 & number_emergency == 0)
diaData_novis$CAT <- 0
diaData_novis$CATNAME <- "no_hospital_visit"
diaData_novis$total_patient <- diaData_novis$number_outpatient +
diaData_novis$number_inpatient +
diaData_novis$number_emergency
# Combine "hospital visit" and "no hospital visit" feature into one dataset.
final_diaData <- rbind(diaData_vis, diaData_novis)
The 23-medication-feature indicates whether the drug was prescribed or there was a change in the dosage. Values: “up” if the dosage was increased during the encounter, “down” if the dosage was decreased, “steady” if the dosage did not change, and “no” if the drug was not prescribed.
The 1st feature is “Steady”, which indicates us how many number of medications the patient is taking steadily in the dosage change.
The 2nd feature is “Up_down”, which indicates us how many number of medications have been increased or decreased in dosage for the patient.
library(dplyr)
f1 <- final_diaData %>%
select(1:length(final_diaData)) %>%
mutate(Up_down = ifelse(metformin == c("Up", "Down"), 1, 0) ) %>%
mutate (Steady = ifelse(metformin =="Steady", 1, 0) )
f2 <- final_diaData %>%
select(1:length(final_diaData)) %>%
mutate(Up_down = ifelse(repaglinide == c("Up", "Down"), 1, 0) ) %>%
mutate (Steady = ifelse(repaglinide =="Steady", 1, 0) )
f3 <- final_diaData %>%
select(1:length(final_diaData)) %>%
mutate(Up_down = ifelse(chlorpropamide == c("Up", "Down"), 1, 0) ) %>%
mutate (Steady = ifelse(chlorpropamide =="Steady", 1, 0) )
f4 <- final_diaData %>%
select(1:length(final_diaData)) %>%
mutate(Up_down = ifelse(glimepiride == c("Up", "Down"), 1, 0) ) %>%
mutate (Steady = ifelse(glimepiride =="Steady", 1, 0) )
f5 <- final_diaData %>%
select(1:length(final_diaData)) %>%
mutate(Up_down = ifelse(acetohexamide == c("Up", "Down"), 1, 0) ) %>%
mutate (Steady = ifelse(acetohexamide =="Steady", 1, 0) )
f6 <- final_diaData %>%
select(1:length(final_diaData)) %>%
mutate(Up_down = ifelse(glipizide == c("Up", "Down"), 1, 0) ) %>%
mutate (Steady = ifelse(glipizide =="Steady", 1, 0) )
f7 <- final_diaData %>%
select(1:length(final_diaData)) %>%
mutate(Up_down = ifelse(glyburide == c("Up", "Down"), 1, 0) ) %>%
mutate (Steady = ifelse(glyburide =="Steady", 1, 0) )
f8 <- final_diaData %>%
select(1:length(final_diaData)) %>%
mutate(Up_down = ifelse(tolbutamide == c("Up", "Down"), 1, 0) ) %>%
mutate (Steady = ifelse(tolbutamide =="Steady", 1, 0) )
f9 <- final_diaData %>%
select(1:length(final_diaData)) %>%
mutate(Up_down = ifelse(pioglitazone == c("Up", "Down"), 1, 0) ) %>%
mutate (Steady = ifelse(pioglitazone =="Steady", 1, 0) )
f10 <- final_diaData %>%
select(1:length(final_diaData)) %>%
mutate(Up_down = ifelse(rosiglitazone == c("Up", "Down"), 1, 0) ) %>%
mutate (Steady = ifelse(rosiglitazone =="Steady", 1, 0) )
f11 <- final_diaData %>%
select(1:length(final_diaData)) %>%
mutate(Up_down = ifelse(acarbose == c("Up", "Down"), 1, 0) ) %>%
mutate (Steady = ifelse(acarbose =="Steady", 1, 0) )
f12 <- final_diaData %>%
select(1:length(final_diaData)) %>%
mutate(Up_down = ifelse(miglitol == c("Up", "Down"), 1, 0) ) %>%
mutate (Steady = ifelse(miglitol =="Steady", 1, 0) )
f13 <- final_diaData %>%
select(1:length(final_diaData)) %>%
mutate(Up_down = ifelse(troglitazone == c("Up", "Down"), 1, 0) ) %>%
mutate (Steady = ifelse(troglitazone =="Steady", 1, 0) )
f14 <- final_diaData %>%
select(1:length(final_diaData)) %>%
mutate(Up_down = ifelse(tolazamide == c("Up", "Down"), 1, 0) ) %>%
mutate (Steady = ifelse(tolazamide =="Steady", 1, 0) )
f15 <- final_diaData %>%
select(1:length(final_diaData)) %>%
mutate(Up_down = ifelse(examide == c("Up", "Down"), 1, 0) ) %>%
mutate (Steady = ifelse(examide =="Steady", 1, 0) )
f16 <- final_diaData %>%
select(1:length(final_diaData)) %>%
mutate(Up_down = ifelse(citoglipton == c("Up", "Down"), 1, 0) ) %>%
mutate (Steady = ifelse(citoglipton =="Steady", 1, 0) )
f17 <- final_diaData %>%
select(1:length(final_diaData)) %>%
mutate(Up_down = ifelse(insulin == c("Up", "Down"), 1, 0) ) %>%
mutate (Steady = ifelse(insulin =="Steady", 1, 0) )
f18 <- final_diaData %>%
select(1:length(final_diaData)) %>%
mutate(Up_down = ifelse('glyburide-metformin' %in% c("Up", "Down"), 1, 0) ) %>%
mutate (Steady = ifelse('glyburide-metformin' == "Steady", 1, 0) )
f19 <- final_diaData %>%
select(1:length(final_diaData)) %>%
mutate(Up_down = ifelse('glipizide-metformin' %in% c("Up", "Down"), 1, 0) ) %>%
mutate (Steady = ifelse('glipizide-metformin' == "Steady", 1, 0) )
f20 <- final_diaData %>%
select(1:length(final_diaData)) %>%
mutate(Up_down = ifelse('glimepiride-pioglitazone' %in% c("Up", "Down"), 1, 0) ) %>%
mutate (Steady = ifelse('glimepiride-pioglitazone' == "Steady", 1, 0) )
f21 <- final_diaData %>%
select(1:length(final_diaData)) %>%
mutate(Up_down = ifelse('metformin-rosiglitazone' %in% c("Up", "Down"), 1, 0) ) %>%
mutate (Steady = ifelse('metformin-rosiglitazone' =="Steady", 1, 0) )
f22 <- final_diaData %>%
select(1:length(final_diaData)) %>%
mutate(Up_down = ifelse('metformin-pioglitazone' %in% c("Up", "Down"), 1, 0) ) %>%
mutate (Steady = ifelse('metformin-pioglitazone' =="Steady", 1, 0) )
f23 <- final_diaData %>%
select(1:length(final_diaData)) %>%
mutate(Up_down = ifelse(nateglinide == c("Up", "Down"), 1, 0) ) %>%
mutate (Steady = ifelse(nateglinide =="Steady", 1, 0) )
# Combine "Steady" and "Up_Down" feature into one dataset.
fin_diaData <- rbind(f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15, f16, f17, f18, f19, f20, f21, f22, f23)
Among 50+ variables, only 19 variables were introduced after feature engineering to predict the number of patients who got readmitted under 30 days.
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.0.3
## -- Attaching packages -------------------------------------- tidymodels 0.1.2 --
## v broom 0.7.3 v recipes 0.1.15
## v dials 0.0.9 v rsample 0.0.8
## v infer 0.5.3 v tune 0.1.2
## v modeldata 0.1.0 v workflows 0.2.1
## v parsnip 0.1.5 v yardstick 0.0.7
## Warning: package 'broom' was built under R version 4.0.3
## Warning: package 'dials' was built under R version 4.0.3
## Warning: package 'infer' was built under R version 4.0.3
## Warning: package 'modeldata' was built under R version 4.0.3
## Warning: package 'parsnip' was built under R version 4.0.3
## Warning: package 'recipes' was built under R version 4.0.3
## Warning: package 'rsample' was built under R version 4.0.3
## Warning: package 'tune' was built under R version 4.0.3
## Warning: package 'workflows' was built under R version 4.0.3
## Warning: package 'yardstick' was built under R version 4.0.3
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x yardstick::conf_mat() masks lares::conf_mat()
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x yardstick::mae() masks lares::mae()
## x yardstick::mape() masks lares::mape()
## x yardstick::rmse() masks lares::rmse()
## x yardstick::rsq() masks lares::rsq()
## x yardstick::spec() masks readr::spec()
## x Hmisc::src() masks dplyr::src()
## x recipes::step() masks stats::step()
## x Hmisc::summarize() masks dplyr::summarize()
## x parsnip::translate() masks Hmisc::translate()
set.seed(1234)
sub_diaData <- subset(fin_diaData, select = c("race", "gender", "age",
"time_in_hospital", "num_lab_procedures",
"num_procedures", "num_medications",
"number_diagnoses", "max_glu_serum",
"A1Cresult", "number_emergency",
"change", "diabetesMed" , "readmitted",
"CAT",
"total_patient", "Up_down",
"Steady"))
library(vtreat)
## Warning: package 'vtreat' was built under R version 4.0.3
## Loading required package: wrapr
## Warning: package 'wrapr' was built under R version 4.0.3
##
## Attaching package: 'wrapr'
## The following object is masked from 'package:dplyr':
##
## coalesce
## The following objects are masked from 'package:tidyr':
##
## pack, unpack
## The following object is masked from 'package:tibble':
##
## view
##
## Attaching package: 'vtreat'
## The following object is masked from 'package:recipes':
##
## prepare
## The following object is masked from 'package:parsnip':
##
## fit
# Encode data
vtr <- vtreat::designTreatmentsZ(sub_diaData, c("race", "gender", "age", "max_glu_serum",
"A1Cresult", "change", "diabetesMed",
"time_in_hospital", "num_lab_procedures",
"num_procedures", "num_medications",
"number_diagnoses", "number_emergency",
"CAT", "total_patient", "Up_down",
"Steady"))
## [1] "vtreat 1.6.2 inspecting inputs Wed Feb 17 22:19:57 2021"
## [1] "designing treatments Wed Feb 17 22:19:57 2021"
## [1] " have initial level statistics Wed Feb 17 22:20:04 2021"
## [1] " scoring treatments Wed Feb 17 22:20:20 2021"
## [1] "have treatment plan Wed Feb 17 22:20:28 2021"
new_dia <- vtreat::prepare(vtr, sub_diaData, extracols = "readmitted")
detach("package:vtreat", unload = TRUE)
new_dia$readmitted <- as.factor(as.character(new_dia$readmitted))
fin_dia <- subset(new_dia, select = c("race_catP", "gender_catP", "age_catP",
"max_glu_serum_catP", "A1Cresult_catP",
"time_in_hospital", "num_lab_procedures",
"num_procedures", "num_medications",
"number_diagnoses", "number_emergency",
"CAT", "total_patient", "Up_down", "Steady",
"change_lev_x_Ch", "change_lev_x_No",
"diabetesMed_lev_x_No", "diabetesMed_lev_x_Yes",
"readmitted"
))
slice_dia <- fin_dia %>%
slice(1:300000)
dia_split <- initial_split(slice_dia, strata = readmitted, prop = 3/4)
dia_train <- training(dia_split) # training set
dia_test <- testing(dia_split) # validation set
# Create the regression model 01.
res1 <- aov(total_patient ~ time_in_hospital * A1Cresult, data = sub_diaData)
print(summary(res1))
## Df Sum Sq Mean Sq F value Pr(>F)
## time_in_hospital 1 949 949.1 279.31 <2e-16 ***
## A1Cresult 3 5989 1996.3 587.48 <2e-16 ***
## time_in_hospital:A1Cresult 3 601 200.4 58.97 <2e-16 ***
## Residuals 1315914 4471552 3.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
glu1 <- aov(total_patient ~ time_in_hospital * max_glu_serum, data = sub_diaData)
print(summary(glu1))
## Df Sum Sq Mean Sq F value Pr(>F)
## time_in_hospital 1 949 949 279.87 <2e-16 ***
## max_glu_serum 3 15285 5095 1502.39 <2e-16 ***
## time_in_hospital:max_glu_serum 3 350 117 34.41 <2e-16 ***
## Residuals 1315914 4462507 3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Create the regression model 02.
res2 <- aov(total_patient ~ time_in_hospital + A1Cresult, data = sub_diaData)
print(summary(res2))
## Df Sum Sq Mean Sq F value Pr(>F)
## time_in_hospital 1 949 949.1 279.3 <2e-16 ***
## A1Cresult 3 5989 1996.3 587.4 <2e-16 ***
## Residuals 1315917 4472153 3.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
glu2 <- aov(total_patient ~ time_in_hospital + max_glu_serum, data = sub_diaData)
print(summary(glu2))
## Df Sum Sq Mean Sq F value Pr(>F)
## time_in_hospital 1 949 949 279.9 <2e-16 ***
## max_glu_serum 3 15285 5095 1502.3 <2e-16 ***
## Residuals 1315917 4462857 3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Compare the two models 01 & 02
print(anova(res1,res2))
## Analysis of Variance Table
##
## Model 1: total_patient ~ time_in_hospital * A1Cresult
## Model 2: total_patient ~ time_in_hospital + A1Cresult
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1315914 4471552
## 2 1315917 4472153 -3 -601.12 58.967 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
print(anova(glu1,glu2))
## Analysis of Variance Table
##
## Model 1: total_patient ~ time_in_hospital * max_glu_serum
## Model 2: total_patient ~ time_in_hospital + max_glu_serum
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1315914 4462507
## 2 1315917 4462857 -3 -350.09 34.411 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Results: Model 02 demonstrated significant influence of HbA1c (“A1Cresult”) and max_glu_serum to time in hospital of the patient.
test_c1 <- data.frame(sub_diaData$gender, sub_diaData$A1Cresult)
dt1 = table(sub_diaData$gender, sub_diaData$A1Cresult)
print(dt1)
##
## >7 >8 None Norm
## Female 29486 60421 568744 39767
## Male 27991 63756 491556 34132
## Unknown/Invalid 0 0 69 0
print(chisq.test(dt1))
## Warning in chisq.test(dt1): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: dt1
## X-squared = 1214.5, df = 6, p-value < 2.2e-16
test_c1b <- data.frame(sub_diaData$gender, sub_diaData$max_glu_serum)
dt1b = table(sub_diaData$gender, sub_diaData$max_glu_serum)
print(dt1b)
##
## >200 >300 None Norm
## Female 8625 6716 668656 14421
## Male 6348 6256 592273 12558
## Unknown/Invalid 0 0 69 0
print(chisq.test(dt1b))
## Warning in chisq.test(dt1b): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: dt1b
## X-squared = 137.76, df = 6, p-value < 2.2e-16
test_c2 <- data.frame(sub_diaData$race, sub_diaData$A1Cresult)
dt2 = table(sub_diaData$race, sub_diaData$A1Cresult)
print(dt2)
##
## >7 >8 None Norm
## AfricanAmerican 8993 29164 195339 16054
## Asian 690 1127 7222 575
## Caucasian 43631 82593 790809 52141
## Hispanic 1472 4071 20079 2001
## No record 1495 4738 29992 1679
## Other 1196 2484 16928 1449
print(chisq.test(dt2))
##
## Pearson's Chi-squared test
##
## data: dt2
## X-squared = 5449.9, df = 15, p-value < 2.2e-16
test_c2b <- data.frame(sub_diaData$race, sub_diaData$max_glu_serum)
dt2b = table(sub_diaData$race, sub_diaData$max_glu_serum)
print(dt2b)
##
## >200 >300 None Norm
## AfricanAmerican 1219 1334 244651 2346
## Asian 161 138 9154 161
## Caucasian 12259 10511 924002 22402
## Hispanic 621 644 25254 1104
## No record 299 115 37053 437
## Other 414 230 20884 529
print(chisq.test(dt2b))
##
## Pearson's Chi-squared test
##
## data: dt2b
## X-squared = 5577.4, df = 15, p-value < 2.2e-16
test_c3 <- data.frame(sub_diaData$age, sub_diaData$A1Cresult)
dt3 = table(sub_diaData$age, sub_diaData$A1Cresult)
print(dt3)
##
## >7 >8 None Norm
## [0-10) 23 1840 1265 207
## [10-20) 184 4853 5175 713
## [20-30) 437 4991 15341 1334
## [30-40) 1357 9269 37145 3151
## [40-50) 5382 19849 97474 8947
## [50-60) 9798 28704 183264 13432
## [60-70) 13639 25116 240649 14881
## [70-80) 15410 18998 276598 17273
## [80-90) 9614 9246 174018 12167
## [90-100) 1633 1311 29440 1794
print(chisq.test(dt3))
##
## Pearson's Chi-squared test
##
## data: dt3
## X-squared = 54281, df = 27, p-value < 2.2e-16
test_c3b <- data.frame(sub_diaData$age, sub_diaData$max_glu_serum)
dt3b = table(sub_diaData$age, sub_diaData$max_glu_serum)
print(dt3b)
##
## >200 >300 None Norm
## [0-10) 0 0 3335 0
## [10-20) 46 161 10718 0
## [20-30) 253 575 21160 115
## [30-40) 299 736 49105 782
## [40-50) 1403 2116 125948 2185
## [50-60) 2231 2116 227148 3703
## [60-70) 2553 2001 284510 5221
## [70-80) 4025 2967 313352 7935
## [80-90) 3519 1909 193752 5865
## [90-100) 644 391 31970 1173
print(chisq.test(dt3b))
##
## Pearson's Chi-squared test
##
## data: dt3b
## X-squared = 5268.2, df = 27, p-value < 2.2e-16
test_c4 <- data.frame(sub_diaData$change, sub_diaData$A1Cresult)
dt4 = table(sub_diaData$change, sub_diaData$A1Cresult)
print(dt4)
##
## >7 >8 None Norm
## Ch 28014 79833 447143 31303
## No 29463 44344 613226 42596
print(chisq.test(dt4))
##
## Pearson's Chi-squared test
##
## data: dt4
## X-squared = 22573, df = 3, p-value < 2.2e-16
test_c4b <- data.frame(sub_diaData$change, sub_diaData$max_glu_serum)
dt4b = table(sub_diaData$change, sub_diaData$max_glu_serum)
print(dt4b)
##
## >200 >300 None Norm
## Ch 6647 8349 563937 7360
## No 8326 4623 697061 19619
print(chisq.test(dt4b))
##
## Pearson's Chi-squared test
##
## data: dt4b
## X-squared = 5333.3, df = 3, p-value < 2.2e-16
test_c5 <- data.frame(sub_diaData$diabetesMed, sub_diaData$A1Cresult)
dt5 = table(sub_diaData$diabetesMed, sub_diaData$A1Cresult)
print(dt5)
##
## >7 >8 None Norm
## No 12121 12167 278599 20424
## Yes 45356 112010 781770 53475
print(chisq.test(dt5))
##
## Pearson's Chi-squared test
##
## data: dt5
## X-squared = 17033, df = 3, p-value < 2.2e-16
test_c5b <- data.frame(sub_diaData$diabetesMed, sub_diaData$max_glu_serum)
dt5b = table(sub_diaData$diabetesMed, sub_diaData$max_glu_serum)
print(dt5b)
##
## >200 >300 None Norm
## No 3243 1909 306912 11247
## Yes 11730 11063 954086 15732
print(chisq.test(dt5b))
##
## Pearson's Chi-squared test
##
## data: dt5b
## X-squared = 5050.2, df = 3, p-value < 2.2e-16
test_c6 <- data.frame(sub_diaData$readmitted, sub_diaData$A1Cresult)
dt6 = table(sub_diaData$readmitted, sub_diaData$A1Cresult)
print(dt6)
##
## >7 >8 None Norm
## <30 4968 10396 96094 6279
## >30 16399 35581 319861 19159
## NO 36110 78200 644414 48461
print(chisq.test(dt6))
##
## Pearson's Chi-squared test
##
## data: dt6
## X-squared = 934.6, df = 6, p-value < 2.2e-16
test_c6b <- data.frame(sub_diaData$readmitted, sub_diaData$max_glu_serum)
dt6b = table(sub_diaData$readmitted, sub_diaData$max_glu_serum)
print(dt6b)
##
## >200 >300 None Norm
## <30 1472 1518 112033 2714
## >30 5267 5175 372761 7797
## NO 8234 6279 776204 16468
print(chisq.test(dt6b))
##
## Pearson's Chi-squared test
##
## data: dt6b
## X-squared = 1246.8, df = 6, p-value < 2.2e-16
Results:
A1Cresult (HbA1c) and max_glu_serum (Glucose serum) mostly affects to other variables, specific to: - Gender: Female was mostly influenced by 7<HbA1c<8 & Glucose serum>200, while male become impacted by HbA1c>8 - Race: Caucasian was the most impacted by HbA1c>8 & Glucose serum>200 while Asian was the group as the least influenced among 05 groups (Caucasian, African-American, Asian, Hispanic, and Others). - Age: Age group of [50-60] was the most impacted by HbA1c>8 while group of [70-80] got significantly impacted by Glucose serum>200. In the meanwhile, group of [0-10] was the least impacted against the other. - Medication Change: Change medication was responsible for greater impact than no change medication in the treatment. It happened when patients having HbA1c>8 & Glucose serum>200. This is especially true when the patient is taking diabetes medication rather than no taking such medication. - Readmission: Patients had to readmit within 30 days thanks to HbA1c>8 & Glucose serum>300, while with HbA1c>8 & Glucose serum>200, patients would be prone to readmit after 01 month.
# Create cross-validation bootstraps.
dia_train %>%
count(readmitted)
## readmitted n
## 1 <30 20331
## 2 >30 67506
## 3 NO 137164
set.seed(1234)
dia_folds <- dia_train %>%
mutate(readmitted = factor(readmitted)) %>%
bootstraps(3)
dia_folds
## # Bootstrap sampling
## # A tibble: 3 x 2
## splits id
## <list> <chr>
## 1 <split [225K/82.8K]> Bootstrap1
## 2 <split [225K/82.8K]> Bootstrap2
## 3 <split [225K/82.9K]> Bootstrap3
rf_spec <- rand_forest(trees = 200) %>%
set_mode("classification") %>%
set_engine("ranger")
dia_wf <- workflow() %>%
add_formula(readmitted ~.) %>%
add_model(rf_spec)
dia_wf
## == Workflow ====================================================================
## Preprocessor: Formula
## Model: rand_forest()
##
## -- Preprocessor ----------------------------------------------------------------
## readmitted ~ .
##
## -- Model -----------------------------------------------------------------------
## Random Forest Model Specification (classification)
##
## Main Arguments:
## trees = 200
##
## Computational engine: ranger
library(ranger)
## Warning: package 'ranger' was built under R version 4.0.3
doParallel::registerDoParallel()
dia_rs <- fit_resamples(
dia_wf,
resamples = dia_folds,
control = control_resamples(save_pred = TRUE)
)
dia_rs
## # Resampling results
## # Bootstrap sampling
## # A tibble: 3 x 5
## splits id .metrics .notes .predictions
## <list> <chr> <list> <list> <list>
## 1 <split [225K/82.8~ Bootstra~ <tibble [2 x ~ <tibble [0 x ~ <tibble [82,778 x ~
## 2 <split [225K/82.8~ Bootstra~ <tibble [2 x ~ <tibble [0 x ~ <tibble [82,762 x ~
## 3 <split [225K/82.9~ Bootstra~ <tibble [2 x ~ <tibble [0 x ~ <tibble [82,872 x ~
collect_metrics(dia_rs)
## # A tibble: 2 x 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy multiclass 0.805 3 0.00198 Preprocessor1_Model1
## 2 roc_auc hand_till 0.967 3 0.000337 Preprocessor1_Model1
dia_fit <- last_fit(dia_wf, dia_split)
collect_metrics(dia_fit)
## # A tibble: 2 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy multiclass 0.814 Preprocessor1_Model1
## 2 roc_auc hand_till 0.989 Preprocessor1_Model1
dia_rs %>%
collect_predictions() %>%
group_by(id) %>%
ppv(readmitted, .pred_class)
## # A tibble: 3 x 4
## id .metric .estimator .estimate
## <chr> <chr> <chr> <dbl>
## 1 Bootstrap1 ppv macro 0.908
## 2 Bootstrap2 ppv macro 0.911
## 3 Bootstrap3 ppv macro 0.911
dia_rs %>%
collect_predictions() %>%
group_by(id) %>%
roc_curve(readmitted, `.pred_<30`:.pred_NO ) %>%
ggplot(aes(1 - specificity, sensitivity, color = id)) +
geom_abline(lty = 2, color = "gray80", size = 1.5) +
geom_path(show.legend = FALSE, alpha = 0.6, size = 1.2) +
facet_wrap(~.level, ncol = 5) +
coord_equal()
Figure: Plots describe ROC curve from each class of readmission
Observation: We have an ROC curve for each class and each re-sample in this plot. Notice that the points of class were easy for the model to identify.
dia_rs %>%
collect_predictions() %>%
filter(.pred_class != readmitted) %>%
conf_mat(readmitted, .pred_class) %>%
autoplot(type = "heatmap")
Figure: Confusion Matrix of prediction and truth observations
Observation: The classes in readmission was confused with many of the other classes, whereas class of ‘over-30-day’ readmission was mostly confused with class of ‘No’ readmission.
# Save model
dia_wf_model <- dia_fit$.workflow[[1]]
# predict on test set
predict(dia_wf_model, dia_test[74999, ])
## # A tibble: 1 x 1
## .pred_class
## <fct>
## 1 NO
library(caret)
## Warning: package 'caret' was built under R version 4.0.3
##
## Attaching package: 'caret'
## The following objects are masked from 'package:yardstick':
##
## precision, recall, sensitivity, specificity
## The following object is masked from 'package:survival':
##
## cluster
## The following object is masked from 'package:purrr':
##
## lift
predict_rf <- predict(dia_wf_model, dia_test)
confusionMatrix(dia_test$readmitted, predict_rf$.pred_class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction <30 >30 NO
## <30 2209 83 4518
## >30 0 13179 9288
## NO 0 41 45681
##
## Overall Statistics
##
## Accuracy : 0.8143
## 95% CI : (0.8115, 0.817)
## No Information Rate : 0.7932
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5968
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: <30 Class: >30 Class: NO
## Sensitivity 1.00000 0.9907 0.7679
## Specificity 0.93679 0.8495 0.9974
## Pos Pred Value 0.32438 0.5866 0.9991
## Neg Pred Value 1.00000 0.9976 0.5284
## Prevalence 0.02945 0.1774 0.7932
## Detection Rate 0.02945 0.1757 0.6091
## Detection Prevalence 0.09080 0.2996 0.6096
## Balanced Accuracy 0.96840 0.9201 0.8826
# Out-of-sample-error in validation set
OOSE <- 1 - as.numeric(confusionMatrix(dia_test$readmitted, predict_rf$.pred_class)$overall[1])
OOSE
## [1] 0.1857358
test_case <- fin_dia %>%
slice(300001:300020)
predict(dia_wf_model, test_case)
## # A tibble: 20 x 1
## .pred_class
## <fct>
## 1 NO
## 2 NO
## 3 >30
## 4 NO
## 5 NO
## 6 NO
## 7 NO
## 8 NO
## 9 NO
## 10 NO
## 11 NO
## 12 NO
## 13 >30
## 14 NO
## 15 NO
## 16 NO
## 17 NO
## 18 NO
## 19 NO
## 20 >30
## Warning: package 'rlang' was built under R version 4.0.3
##
## Attaching package: 'rlang'
## The following object is masked from 'package:wrapr':
##
## :=
## The following objects are masked from 'package:purrr':
##
## %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
## flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
## splice
## Warning: package 'rpart' was built under R version 4.0.3
##
## Attaching package: 'rpart'
## The following object is masked from 'package:dials':
##
## prune
## Warning: package 'rpart.plot' was built under R version 4.0.3
churn_tree = rpart(readmitted ~ .,
data = dia_train )
rpart.plot(churn_tree)
churn_tree_tst_pred = predict(churn_tree, dia_test, type = "class")
table(predicted = churn_tree_tst_pred, actual = dia_test$readmitted)
## actual
## predicted <30 >30 NO
## <30 0 0 0
## >30 838 1997 862
## NO 5972 20470 44860
# funct to predict accuracy of Classification tree model
calc_acc = function(actual, predicted) {
mean(actual == predicted)
}
# Accuracy
(tree_tst_acc = calc_acc(predicted = churn_tree_tst_pred, actual = dia_test$readmitted))
## [1] 0.6247683
confusionMatrix(churn_tree_tst_pred, dia_test$readmitted)
## Confusion Matrix and Statistics
##
## Reference
## Prediction <30 >30 NO
## <30 0 0 0
## >30 838 1997 862
## NO 5972 20470 44860
##
## Overall Statistics
##
## Accuracy : 0.6248
## 95% CI : (0.6213, 0.6282)
## No Information Rate : 0.6096
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.075
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: <30 Class: >30 Class: NO
## Sensitivity 0.0000 0.08889 0.98115
## Specificity 1.0000 0.96764 0.09683
## Pos Pred Value NaN 0.54017 0.62915
## Neg Pred Value 0.9092 0.71291 0.76684
## Prevalence 0.0908 0.29956 0.60963
## Detection Rate 0.0000 0.02663 0.59814
## Detection Prevalence 0.0000 0.04929 0.95071
## Balanced Accuracy 0.5000 0.52826 0.53899
# Out-of-sample-error in test set
OOSE <- 1 - as.numeric(confusionMatrix(dia_test$readmitted, churn_tree_tst_pred)$overall[1])
OOSE
## [1] 0.3752317