*What factors collected from the United States’ Hospitals will help predict diabetes readmission.
Center for Clinical and Transnational Research, Virginia Commonwealth University https://archive.ics.uci.edu/ml/datasets/Diabetes+130-US+hospitals+for+years+1999-2008
18 Midwest 58 Northeast 28 South 16 West
Note that a portion of hospitals were labeled as having a bed size of less than 100. And 14 hospitals had a bedsize > 500
This analysis is an improvement on another class project I participated in. The link to the project: https://github.com/ElliotSiegel/Data201/blob/main/Data201_Project2_FINAL.ipynb
There are some improvements from the previous project that need to be made to better understand what factors will help predict diabetes readmission rate. This analysis will fix the few flaws to grasp a better understanding of what this data can unfold.
The reason we choose this dataset is because in my case, I have a long family history of diabetes and another member’s family member is a doctor.
The Data consists of around 101,766 rows - one row per encounter ID - and 50 columns.
library(corrplot)
## corrplot 0.90 loaded
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.2
library("knitr")
library(plotly)
## Warning: package 'plotly' was built under R version 4.1.2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(splus2R)
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.1.2
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.5 v stringr 1.4.0
## v readr 2.0.2 v forcats 0.5.1
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x plotly::filter() masks dplyr::filter(), stats::filter()
## x dplyr::lag() masks stats::lag()
# Setting working directory
setwd("C:/Users/Jerem/OneDrive/Documents/Montgomery College/Fall 2021/DATA 110/Final Project/dataset_diabetes")
# Provide a fast and friendly way to read rectangular data
library(readr)
diabetes <- read_csv("diabetic_data.csv")
## Rows: 101766 Columns: 50
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (37): race, gender, age, weight, payer_code, medical_specialty, diag_1, ...
## dbl (13): encounter_id, patient_nbr, admission_type_id, discharge_dispositio...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
#View(diabetes)
str(diabetes)
## spec_tbl_df [101,766 x 50] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ encounter_id : num [1:101766] 2278392 149190 64410 500364 16680 ...
## $ patient_nbr : num [1:101766] 8222157 55629189 86047875 82442376 42519267 ...
## $ race : chr [1:101766] "Caucasian" "Caucasian" "AfricanAmerican" "Caucasian" ...
## $ gender : chr [1:101766] "Female" "Female" "Female" "Male" ...
## $ age : chr [1:101766] "[0-10)" "[10-20)" "[20-30)" "[30-40)" ...
## $ weight : chr [1:101766] "?" "?" "?" "?" ...
## $ admission_type_id : num [1:101766] 6 1 1 1 1 2 3 1 2 3 ...
## $ discharge_disposition_id: num [1:101766] 25 1 1 1 1 1 1 1 1 3 ...
## $ admission_source_id : num [1:101766] 1 7 7 7 7 2 2 7 4 4 ...
## $ time_in_hospital : num [1:101766] 1 3 2 2 1 3 4 5 13 12 ...
## $ payer_code : chr [1:101766] "?" "?" "?" "?" ...
## $ medical_specialty : chr [1:101766] "Pediatrics-Endocrinology" "?" "?" "?" ...
## $ num_lab_procedures : num [1:101766] 41 59 11 44 51 31 70 73 68 33 ...
## $ num_procedures : num [1:101766] 0 0 5 1 0 6 1 0 2 3 ...
## $ num_medications : num [1:101766] 1 18 13 16 8 16 21 12 28 18 ...
## $ number_outpatient : num [1:101766] 0 0 2 0 0 0 0 0 0 0 ...
## $ number_emergency : num [1:101766] 0 0 0 0 0 0 0 0 0 0 ...
## $ number_inpatient : num [1:101766] 0 0 1 0 0 0 0 0 0 0 ...
## $ diag_1 : chr [1:101766] "250.83" "276" "648" "8" ...
## $ diag_2 : chr [1:101766] "?" "250.01" "250" "250.43" ...
## $ diag_3 : chr [1:101766] "?" "255" "V27" "403" ...
## $ number_diagnoses : num [1:101766] 1 9 6 7 5 9 7 8 8 8 ...
## $ max_glu_serum : chr [1:101766] "None" "None" "None" "None" ...
## $ A1Cresult : chr [1:101766] "None" "None" "None" "None" ...
## $ metformin : chr [1:101766] "No" "No" "No" "No" ...
## $ repaglinide : chr [1:101766] "No" "No" "No" "No" ...
## $ nateglinide : chr [1:101766] "No" "No" "No" "No" ...
## $ chlorpropamide : chr [1:101766] "No" "No" "No" "No" ...
## $ glimepiride : chr [1:101766] "No" "No" "No" "No" ...
## $ acetohexamide : chr [1:101766] "No" "No" "No" "No" ...
## $ glipizide : chr [1:101766] "No" "No" "Steady" "No" ...
## $ glyburide : chr [1:101766] "No" "No" "No" "No" ...
## $ tolbutamide : chr [1:101766] "No" "No" "No" "No" ...
## $ pioglitazone : chr [1:101766] "No" "No" "No" "No" ...
## $ rosiglitazone : chr [1:101766] "No" "No" "No" "No" ...
## $ acarbose : chr [1:101766] "No" "No" "No" "No" ...
## $ miglitol : chr [1:101766] "No" "No" "No" "No" ...
## $ troglitazone : chr [1:101766] "No" "No" "No" "No" ...
## $ tolazamide : chr [1:101766] "No" "No" "No" "No" ...
## $ examide : chr [1:101766] "No" "No" "No" "No" ...
## $ citoglipton : chr [1:101766] "No" "No" "No" "No" ...
## $ insulin : chr [1:101766] "No" "Up" "No" "Up" ...
## $ glyburide-metformin : chr [1:101766] "No" "No" "No" "No" ...
## $ glipizide-metformin : chr [1:101766] "No" "No" "No" "No" ...
## $ glimepiride-pioglitazone: chr [1:101766] "No" "No" "No" "No" ...
## $ metformin-rosiglitazone : chr [1:101766] "No" "No" "No" "No" ...
## $ metformin-pioglitazone : chr [1:101766] "No" "No" "No" "No" ...
## $ change : chr [1:101766] "No" "Ch" "No" "Ch" ...
## $ diabetesMed : chr [1:101766] "No" "Yes" "Yes" "Yes" ...
## $ readmitted : chr [1:101766] "NO" ">30" "NO" "NO" ...
## - attr(*, "spec")=
## .. cols(
## .. encounter_id = col_double(),
## .. patient_nbr = col_double(),
## .. race = col_character(),
## .. gender = col_character(),
## .. age = col_character(),
## .. weight = col_character(),
## .. admission_type_id = col_double(),
## .. discharge_disposition_id = col_double(),
## .. admission_source_id = col_double(),
## .. time_in_hospital = col_double(),
## .. payer_code = col_character(),
## .. medical_specialty = col_character(),
## .. num_lab_procedures = col_double(),
## .. num_procedures = col_double(),
## .. num_medications = col_double(),
## .. number_outpatient = col_double(),
## .. number_emergency = col_double(),
## .. number_inpatient = col_double(),
## .. diag_1 = col_character(),
## .. diag_2 = col_character(),
## .. diag_3 = col_character(),
## .. number_diagnoses = col_double(),
## .. max_glu_serum = col_character(),
## .. A1Cresult = col_character(),
## .. metformin = col_character(),
## .. repaglinide = col_character(),
## .. nateglinide = col_character(),
## .. chlorpropamide = col_character(),
## .. glimepiride = col_character(),
## .. acetohexamide = col_character(),
## .. glipizide = col_character(),
## .. glyburide = col_character(),
## .. tolbutamide = col_character(),
## .. pioglitazone = col_character(),
## .. rosiglitazone = col_character(),
## .. acarbose = col_character(),
## .. miglitol = col_character(),
## .. troglitazone = col_character(),
## .. tolazamide = col_character(),
## .. examide = col_character(),
## .. citoglipton = col_character(),
## .. insulin = col_character(),
## .. `glyburide-metformin` = col_character(),
## .. `glipizide-metformin` = col_character(),
## .. `glimepiride-pioglitazone` = col_character(),
## .. `metformin-rosiglitazone` = col_character(),
## .. `metformin-pioglitazone` = col_character(),
## .. change = col_character(),
## .. diabetesMed = col_character(),
## .. readmitted = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
Observing the dataset, there are many variables with “?” as a value which indicates a missing value. From a quick overview of the data, there seems to be an abundance of “?” in the weight and payer_code category. How much of the data consists of these “?” values? To begin the cleaning process, I replace all “?” values as NA to be able to have better management over these values. Once all of the “?” values become NA values, I grouped all features that consist of missing values and graph them to make out the distribution of missing data in the dataset.
# replacing "?" values as NA
diabetes <- read.csv("diabetic_data.csv", na.strings = c("?"))
# Grouped all missing values
missing.values <- diabetes %>%
gather(key = "key", value = "val") %>%
mutate(is.missing = is.na(val)) %>%
group_by(key, is.missing) %>%
summarise(num.missing = n()) %>%
filter(is.missing==T) %>%
select(-is.missing) %>%
arrange(desc(num.missing))
## `summarise()` has grouped output by 'key'. You can override using the `.groups` argument.
# Plotting missing values
missing.values %>%
ggplot() +
geom_bar(aes(x = key, y=num.missing), stat = 'identity') +
labs(x ='variable', y ="number of missing values", title = 'Number of missing values') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Upon examination, we can see that seven out of the 50 features consist of missing values. In addition, some variables such as weight consist of an extremely high number of missing values. This can heavily skew the data results therefore, we can consider a threshold of 40% of missing values in each variable to be considered to be removed from the dataset. Thus, variables weight, payer_code, and medical_specialty were removed from the dataset.
# Removing variables from the dataset
diabetes$weight = NULL
diabetes$payer_code = NULL
diabetes$medical_specialty = NULL
A second observation is that the dataset contains multiple rows of the same patient number (patient_nbr). It is unclear if the repeating patient numbers are independent from each other. There is a risk that these repeating variables can be dependent on each other, thus skewing the data results. To reduce the risk, I remove duplicate patients encounter and only take the first observation.
# Removing duplicate patient numbers
diabetes <- diabetes[!duplicated(diabetes$patient_nbr),]
Several variables were formatted the wrong way. Thus I change the data type of the variables from numeric to factor. Variables changed: admission_type_id, discharge_disposition_id, and admission_source_id.
# Change column type from numerical to categorical/nominal
diabetes$admission_type_id <- as.factor(diabetes$admission_type_id)
diabetes$discharge_disposition_id <- as.factor(diabetes$discharge_disposition_id)
diabetes$admission_source_id <- as.factor(diabetes$admission_source_id)
Within the EDA, certain variables were pointless or impractical towards the analysis. Thus, the removal of these variables were implemented. Varibles removed include: encounter_id, diag_2, diag_3, examide, and citoglipton.
# Delete columns "encounter_id"
diabetes$encounter_id = NULL
# Delete columns "diag_2", "diag_3", only use the primary diagnosis(diag_1)
diabetes$diag_2 = NULL
diabetes$diag_3 = NULL
# Delete "examide" and "citoglipton" only have 1 purposeless value
diabetes$examide = NULL
diabetes$citoglipton = NULL
There are still many missing values in the dataset. Keeping these values will result to biases, therefore the removal of all missing values.
diabetes <- na.omit(diabetes)
diabetes %>% head() %>% knitr::kable()
| patient_nbr | race | gender | age | admission_type_id | discharge_disposition_id | admission_source_id | time_in_hospital | num_lab_procedures | num_procedures | num_medications | number_outpatient | number_emergency | number_inpatient | diag_1 | number_diagnoses | max_glu_serum | A1Cresult | metformin | repaglinide | nateglinide | chlorpropamide | glimepiride | acetohexamide | glipizide | glyburide | tolbutamide | pioglitazone | rosiglitazone | acarbose | miglitol | troglitazone | tolazamide | insulin | glyburide.metformin | glipizide.metformin | glimepiride.pioglitazone | metformin.rosiglitazone | metformin.pioglitazone | change | diabetesMed | readmitted |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8222157 | Caucasian | Female | [0-10) | 6 | 25 | 1 | 1 | 41 | 0 | 1 | 0 | 0 | 0 | 250.83 | 1 | None | None | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | NO |
| 55629189 | Caucasian | Female | [10-20) | 1 | 1 | 7 | 3 | 59 | 0 | 18 | 0 | 0 | 0 | 276 | 9 | None | None | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | Up | No | No | No | No | No | Ch | Yes | >30 |
| 86047875 | AfricanAmerican | Female | [20-30) | 1 | 1 | 7 | 2 | 11 | 5 | 13 | 2 | 0 | 1 | 648 | 6 | None | None | No | No | No | No | No | No | Steady | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | Yes | NO |
| 82442376 | Caucasian | Male | [30-40) | 1 | 1 | 7 | 2 | 44 | 1 | 16 | 0 | 0 | 0 | 8 | 7 | None | None | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | Up | No | No | No | No | No | Ch | Yes | NO |
| 42519267 | Caucasian | Male | [40-50) | 1 | 1 | 7 | 1 | 51 | 0 | 8 | 0 | 0 | 0 | 197 | 5 | None | None | No | No | No | No | No | No | Steady | No | No | No | No | No | No | No | No | Steady | No | No | No | No | No | Ch | Yes | NO |
| 82637451 | Caucasian | Male | [50-60) | 2 | 1 | 2 | 3 | 31 | 6 | 16 | 0 | 0 | 0 | 414 | 9 | None | None | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | Steady | No | No | No | No | No | No | Yes | >30 |
The dataframe now consist of 69560 obs. of 42 variables:
# plotting race
p1 <- plot_ly(diabetes,x = ~race, color = ~race)
# plotting gender
p2 <- plot_ly(diabetes,x = ~gender, color = ~gender)
# plotting age
p3 <- plot_ly( diabetes,x = ~age, color = ~age)
# plotting demographic varibes together
subplot(p1, p2, p3) %>% hide_legend()
## No trace type specified:
## Based on info supplied, a 'histogram' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#histogram
## No trace type specified:
## Based on info supplied, a 'histogram' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#histogram
## No trace type specified:
## Based on info supplied, a 'histogram' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#histogram
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
From the demographic data, 76099 out of 99493 of our patients are Caucasian. Followed by that African American. The distribution of gender is nearly equal, but females are a little more prone to diabetes than males. The distribution of ages that were admitted to the hospital with diabetes exponentially increases with a 10 age interval until the age of 70-80. Which is the most common age interval that was admitted to the hospital was that of 70-80. Based on this we can understand that the elderly population is more prone to diabetes than the younger populations.
This data is not a true representation of the whole population of diabetes.
The feature description of each ID variables is reference from IDs_mapping.csv from the original dataset downloaded form UCI Machine Learning repository.
#collapsing some other variable
diabetes$admission_type <- replace(diabetes$admission_type,diabetes$admission_type == 2, 1)
diabetes$admission_type <- replace(diabetes$admission_type,diabetes$admission_type == 7, 1)
diabetes$admission_type <- replace(diabetes$admission_type,diabetes$admission_type == 6, 5)
diabetes$admission_type <- replace(diabetes$admission_type,diabetes$admission_type == 8, 5)
#change the name of variables
diabetes$admission_type <- str_replace(diabetes$admission_type,"1","Emergency")
diabetes$admission_type <- str_replace(diabetes$admission_type,"5","Other")
diabetes$admission_type <- str_replace(diabetes$admission_type,"3","Elective")
diabetes$admission_type <- str_replace(diabetes$admission_type,"4","Newborn")
# Plotting Admission Type
ggplot(diabetes, aes(admission_type)) +
geom_bar(fill = "#0073C2FF") +
labs(title = "Proportion of Admission Type",
x = "Admission Type",
y = "Count")
#collapsing some other variable and change the name of variables
diabetes$discharge_disposition <- case_when(diabetes$discharge_disposition %in% "1" ~ "Home",
TRUE ~ "Other")
# Plotting Discharge Disposition
ggplot(diabetes, aes(discharge_disposition)) +
geom_bar(fill = "#0073C2FF") +
labs(title = "Proportion of Discharge Disposition",
x = "Discharge Disposition",
y = "Count")
#collapsing some other variable and change the name of variables
diabetes$admission_source <- case_when(diabetes$admission_source %in% c("1","2","3") ~ "Physician Referral",
diabetes$admission_source %in% c("4","5","6","8","9","10","11","12","13","14","15","17","18","19","20","21","22","23","24","25","26") ~ "Other",
TRUE ~ "Emergency Room")
# Plotting Admission Source
ggplot(diabetes, aes(admission_source)) +
geom_bar(fill = "#0073C2FF") +
labs(title = "Proportion of Admission Source",
x = "Admission Source",
y = "Count")
Due to time constraint, limited knowledge in the medical industry, and the complexity of the data, further analysis of this variable will be void.
diag_1 contains too many levels and requires too many dummy variables.
# Plotting Boxplot
par(mfrow = c(2,4))
boxplot(diabetes$time_in_hospital, main = "time_in_hospital")
boxplot(diabetes$num_lab_procedures, main = "num_lab_procedures")
boxplot(diabetes$num_procedures, main = "num_procedures")
boxplot(diabetes$num_medications, main = "num_medications")
boxplot(diabetes$number_outpatient, main = "number_outpatient")
boxplot(diabetes$number_emergency, main = "number_emergency")
boxplot(diabetes$number_inpatient, main = "number_inpatient")
boxplot(diabetes$number_diagnoses, main = "number_diagnoses")
#remove the outliers
outliers_remover <- function(a){
df <- a
aa <- c()
count <- 1
for(i in 1:ncol(df)){
if(is.integer(df[,i])){
Q3 <- quantile(df[,i], 0.75, na.rm = TRUE)
Q1 <- quantile(df[,i], 0.25, na.rm = TRUE)
IQR <- Q3 - Q1 #IQR(df[,i])
upper <- Q3 + 1.5 * IQR
lower <- Q1 - 1.5 * IQR
for(j in 1:nrow(df)){
if(is.na(df[j,i]) == TRUE){
next
}
else if(df[j,i] > upper | df[j,i] < lower){
aa[count] <- j
count <- count+1
}
}
}
}
df <- df[-aa,]
}
diabetes <- outliers_remover(diabetes)
# Box plot without outliers
par(mfrow = c(2,4))
boxplot(diabetes$time_in_hospital, main = "time_in_hospital")
boxplot(diabetes$num_lab_procedures, main = "num_lab_procedures")
boxplot(diabetes$num_procedures, main = "num_procedures")
boxplot(diabetes$num_medications, main = "num_medications")
boxplot(diabetes$number_outpatient, main = "number_outpatient")
boxplot(diabetes$number_emergency, main = "number_emergency")
boxplot(diabetes$number_inpatient, main = "number_inpatient")
boxplot(diabetes$number_diagnoses, main = "number_diagnoses")
Upon examination, number of lab procedures and number of medications seems normally distributed without outlines. Time in hospital and number of procedures seem to be skewed to the right. Number of diagnoses seems skewed to the left. Number outpatient, number emergency, and number inpatient seem to consist of outlines.
Number outpatient, number emergency, and number inpatient is pointless to the data.
diabetes$number_outpatient = NULL
diabetes$number_emergency = NULL
diabetes$number_inpatient = NULL
# Plotting correlation heatmap
corr_diabetes <- select(diabetes, "time_in_hospital", "num_lab_procedures", "num_procedures", "num_medications", "number_diagnoses")
corrplot.mixed(cor(corr_diabetes),
lower = "ellipse",
upper = "number",
tl.pos = "lt",
diag = "l",
tl.col = "black")
Take an aerial view of the numerical features of our data by a correlation heat map. As you can see there are three correlations that catch our eye.
*Num medications vs time in hospital: Correlation of 0.47
*Num lab procedures vs time in hospital: Correlation of 0.32
*Num medication vs num lab procedures: Correlation of 0.39
Even though these correlation are quite low, these relationship have potential to predict end results