library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(caret)
## Loading required package: lattice
library(corrplot)
## corrplot 0.95 loaded
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(e1071)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(readr)
OUData <- read_csv("OUData.csv")
## Rows: 1111 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): ObservationRecordKey, Gender, PrimaryInsuranceCategory, InitPatien...
## dbl (5): Age, Flipped, OU_LOS_hrs, DRG01, BloodPressureLower
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
OUD<- OUData
summary(OUD)
## ObservationRecordKey Age Gender
## Length:1111 Min. :19.00 Length:1111
## Class :character 1st Qu.:57.00 Class :character
## Mode :character Median :73.00 Mode :character
## Mean :69.03
## 3rd Qu.:86.00
## Max. :89.00
## PrimaryInsuranceCategory InitPatientClassAndFirstPostOUClass Flipped
## Length:1111 Length:1111 Min. :0.0000
## Class :character Class :character 1st Qu.:0.0000
## Mode :character Mode :character Median :0.0000
## Mean :0.4608
## 3rd Qu.:1.0000
## Max. :1.0000
## OU_LOS_hrs DRG01 BloodPressureUpper BloodPressureLower
## Min. : 1.20 Min. :276.0 Length:1111 Min. : 0.00
## 1st Qu.: 24.40 1st Qu.:577.0 Class :character 1st Qu.: 65.00
## Median : 45.80 Median :780.0 Mode :character Median : 74.00
## Mean : 62.25 Mean :664.9 Mean : 75.27
## 3rd Qu.: 84.40 3rd Qu.:786.0 3rd Qu.: 85.00
## Max. :405.50 Max. :789.0 Max. :146.00
## BloodPressureDiff Pulse PulseOximetry Respirations
## Length:1111 Length:1111 Length:1111 Length:1111
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Temperature
## Length:1111
## Class :character
## Mode :character
##
##
##
str(OUD)
## spc_tbl_ [1,111 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ObservationRecordKey : chr [1:1111] "905459x1" "443621z2" "131565z1" "438080x1" ...
## $ Age : num [1:1111] 54 89 83 89 81 54 83 50 52 85 ...
## $ Gender : chr [1:1111] "Male" "Female" "Female" "Female" ...
## $ PrimaryInsuranceCategory : chr [1:1111] "MEDICAID STATE" "MEDICARE" "MEDICARE" "MEDICARE" ...
## $ InitPatientClassAndFirstPostOUClass: chr [1:1111] "OBSERVATION->INPATIENT" "OBSERVATION->OBSERVATION" "OBSERVATION->INPATIENT" "OBSERVATION->OBSERVATION" ...
## $ Flipped : num [1:1111] 1 0 1 0 0 0 1 1 0 1 ...
## $ OU_LOS_hrs : num [1:1111] 37.3 89.7 96.3 13.3 25.4 ...
## $ DRG01 : num [1:1111] 428 599 786 780 428 786 558 780 786 780 ...
## $ BloodPressureUpper : chr [1:1111] "153" "123" "105" "162" ...
## $ BloodPressureLower : num [1:1111] 111 64 55 73 60 56 61 48 85 60 ...
## $ BloodPressureDiff : chr [1:1111] "26" "68" "29" "83" ...
## $ Pulse : chr [1:1111] "73" "86" "81" "76" ...
## $ PulseOximetry : chr [1:1111] "100" "94" "94" "97" ...
## $ Respirations : chr [1:1111] "18" "18" "18" "24" ...
## $ Temperature : chr [1:1111] "98.2" "95.9" "97.3" "98.1" ...
## - attr(*, "spec")=
## .. cols(
## .. ObservationRecordKey = col_character(),
## .. Age = col_double(),
## .. Gender = col_character(),
## .. PrimaryInsuranceCategory = col_character(),
## .. InitPatientClassAndFirstPostOUClass = col_character(),
## .. Flipped = col_double(),
## .. OU_LOS_hrs = col_double(),
## .. DRG01 = col_double(),
## .. BloodPressureUpper = col_character(),
## .. BloodPressureLower = col_double(),
## .. BloodPressureDiff = col_character(),
## .. Pulse = col_character(),
## .. PulseOximetry = col_character(),
## .. Respirations = col_character(),
## .. Temperature = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
OUD <- OUD %>%
mutate(
across(c(BloodPressureUpper, BloodPressureDiff, Pulse, PulseOximetry, Respirations, Temperature),
~ as.numeric(gsub("[^0-9.]", "", .)))
)
OUD <- OUD %>%
mutate(
across(c(Gender, PrimaryInsuranceCategory, InitPatientClassAndFirstPostOUClass, Flipped, DRG01), as.factor),
across(c(BloodPressureUpper, BloodPressureLower, BloodPressureDiff, Pulse, PulseOximetry, Respirations, Temperature), as.numeric)
)
# Assuming OUD is your dataset
missing_values <- data.frame(Variable = names(OUD), Missing_Values = colSums(is.na(OUD)))
# Load the kableExtra package
library(kableExtra)
# Create a styled table with kableExtra
kable(missing_values, caption = "Number of Missing Values") %>%
kable_styling(bootstrap_options = "striped", full_width = F, position = "center") %>%
column_spec(1, width = "25em") %>%
column_spec(2, width = "10em") %>%
row_spec(0, bold = T, color = "white", background = "#2C3E50") %>%
row_spec(1:nrow(missing_values), background = "#ECF0F1")
| Variable | Missing_Values | |
|---|---|---|
| ObservationRecordKey | ObservationRecordKey | 0 |
| Age | Age | 0 |
| Gender | Gender | 0 |
| PrimaryInsuranceCategory | PrimaryInsuranceCategory | 0 |
| InitPatientClassAndFirstPostOUClass | InitPatientClassAndFirstPostOUClass | 0 |
| Flipped | Flipped | 0 |
| OU_LOS_hrs | OU_LOS_hrs | 0 |
| DRG01 | DRG01 | 0 |
| BloodPressureUpper | BloodPressureUpper | 3 |
| BloodPressureLower | BloodPressureLower | 0 |
| BloodPressureDiff | BloodPressureDiff | 4 |
| Pulse | Pulse | 4 |
| PulseOximetry | PulseOximetry | 1 |
| Respirations | Respirations | 4 |
| Temperature | Temperature | 4 |
# Histogram of Pulse Oximetry with units
hist_data <- OUD$PulseOximetry
hst <- hist(hist_data,
main = "Enhanced Histogram of Pulse Oximetry",
xlab = "Pulse Oximetry (%)", # Adding units for Pulse Oximetry
ylab = "Frequency",
col = rgb(0.2, 0.4, 0.8, 0.5),
border = "darkblue",
breaks = 20,
prob = TRUE)
# Adding density plot
lines(density(hist_data, na.rm = TRUE),
col = "red",
lwd = 2)
# Adding normal distribution curve
xfit <- seq(min(hist_data, na.rm = TRUE), max(hist_data, na.rm = TRUE), length = 100)
yfit <- dnorm(xfit, mean = mean(hist_data, na.rm = TRUE), sd = sd(hist_data, na.rm = TRUE))
lines(xfit, yfit, col = "darkgreen", lwd = 2, lty = 2)
# Adding grid and legend
grid(col = "gray", lty = "dotted")
legend("topright", legend = c("Density Plot", "Normal Curve"),
col = c("red", "darkgreen"), lwd = 2, lty = c(1, 2))
# Histogram of Respirations with units
hist_data2 <- OUD$Respirations
hst2 <- hist(hist_data2,
main = "Enhanced Histogram of Respirations",
xlab = "Respirations (breaths per minute)", # Adding units for Respirations
ylab = "Density",
col = rgb(0.2, 0.4, 0.8, 0.5),
border = "darkblue",
breaks = 20,
prob = TRUE)
# Adding density plot
lines(density(hist_data2, na.rm = TRUE),
col = "red",
lwd = 2)
# Adding normal distribution curve
xfit2 <- seq(min(hist_data2, na.rm = TRUE), max(hist_data2, na.rm = TRUE), length = 100)
yfit2 <- dnorm(xfit2, mean = mean(hist_data2, na.rm = TRUE), sd = sd(hist_data2, na.rm = TRUE))
lines(xfit2, yfit2, col = "darkgreen", lwd = 2, lty = 2)
# Adding grid and legend
grid(col = "gray", lty = "dotted")
legend("topright", legend = c("Density Plot", "Normal Curve"),
col = c("red", "darkgreen"), lwd = 2, lty = c(1, 2))
# Histogram of Blood Pressure (Upper) after scaling
hst3 <- hist(OUD$BloodPressureUpper,
main = "Enhanced Histogram of Blood Pressure (Upper)",
xlab = "Blood Pressure (Upper) (mmHg)", # Adding units for Blood Pressure (Upper)
ylab = "Density",
col = rgb(0.2, 0.4, 0.8, 0.5),
border = "blue",
breaks = 20,
prob = TRUE)
# Adding density plot
lines(density(OUD$BloodPressureUpper, na.rm = TRUE),
col = "red",
lwd = 2)
# Adding normal distribution curve
xfit3 <- seq(min(OUD$BloodPressureUpper, na.rm = TRUE), max(OUD$BloodPressureUpper, na.rm = TRUE), length = 100)
yfit3 <- dnorm(xfit3, mean = mean(OUD$BloodPressureUpper, na.rm = TRUE), sd = sd(OUD$BloodPressureUpper, na.rm = TRUE))
lines(xfit3, yfit3, col = "darkgreen", lwd = 2, lty = 2)
# Adding grid and legend
grid(col = "gray", lty = "dotted")
legend("topright", legend = c("Density Plot", "Normal Curve"),
col = c("red", "darkgreen"), lwd = 2, lty = c(1, 2))
# Histogram of Pulse Rate
hst4 <- hist(OUD$Pulse,
main = "Enhanced Histogram of Pulse Rate",
xlab = "Pulse (beats per minute)", # Adding units for Pulse
ylab = "Density",
col = rgb(0.2, 0.4, 0.8, 0.5),
border = "blue",
breaks = 20,
prob = TRUE)
# Adding density plot
lines(density(OUD$Pulse, na.rm = TRUE),
col = "red",
lwd = 2)
# Adding normal distribution curve
xfit4 <- seq(min(OUD$Pulse, na.rm = TRUE), max(OUD$Pulse, na.rm = TRUE), length = 100)
yfit4 <- dnorm(xfit4, mean = mean(OUD$Pulse, na.rm = TRUE), sd = sd(OUD$Pulse, na.rm = TRUE))
lines(xfit4, yfit4, col = "darkgreen", lwd = 2, lty = 2)
# Adding grid and legend
grid(col = "gray", lty = "dotted")
legend("topright", legend = c("Density Plot", "Normal Curve"),
col = c("red", "darkgreen"), lwd = 2, lty = c(1, 2))
# Histogram of Body Temperature
hst5 <- hist(OUD$Temperature,
main = "Enhanced Histogram of Body Temperature",
xlab = "Temperature (°C)", # Adding units for Temperature
ylab = "Density",
col = rgb(0.2, 0.4, 0.8, 0.5),
border = "blue",
breaks = 20,
prob = TRUE)
# Adding density plot
lines(density(OUD$Temperature, na.rm = TRUE),
col = "red",
lwd = 2)
# Adding normal distribution curve
xfit5 <- seq(min(OUD$Temperature, na.rm = TRUE), max(OUD$Temperature, na.rm = TRUE), length = 100)
yfit5 <- dnorm(xfit5, mean = mean(OUD$Temperature, na.rm = TRUE), sd = sd(OUD$Temperature, na.rm = TRUE))
lines(xfit5, yfit5, col = "darkgreen", lwd = 2, lty = 2)
# Adding grid and legend
grid(col = "gray", lty = "dotted")
legend("topright", legend = c("Density Plot", "Normal Curve"),
col = c("red", "darkgreen"), lwd = 2, lty = c(1, 2))
## NA values with Median As the variables are Skewed
library(dplyr)
OUD <- OUD %>%
mutate(across(c(PulseOximetry, Respirations, BloodPressureDiff, BloodPressureUpper, Pulse, Temperature),
~ ifelse(is.na(.), median(., na.rm = TRUE), .),
.names = "filled_{.col}")) # Adds '_filled' prefix for clarity
# Print the updated OUD dataset
print(OUD)
## # A tibble: 1,111 × 21
## ObservationRecordKey Age Gender PrimaryInsuranceCategory
## <chr> <dbl> <fct> <fct>
## 1 905459x1 54 Male MEDICAID STATE
## 2 443621z2 89 Female MEDICARE
## 3 131565z1 83 Female MEDICARE
## 4 438080x1 89 Female MEDICARE
## 5 763005z1 81 Female MEDICARE OTHER
## 6 244227z1 54 Female MEDICARE OTHER
## 7 448887x1 83 Female MEDICAID STATE
## 8 371392x1 50 Female MEDICARE
## 9 859289z1 52 Male Private
## 10 436418z1 85 Female MEDICARE OTHER
## # ℹ 1,101 more rows
## # ℹ 17 more variables: InitPatientClassAndFirstPostOUClass <fct>,
## # Flipped <fct>, OU_LOS_hrs <dbl>, DRG01 <fct>, BloodPressureUpper <dbl>,
## # BloodPressureLower <dbl>, BloodPressureDiff <dbl>, Pulse <dbl>,
## # PulseOximetry <dbl>, Respirations <dbl>, Temperature <dbl>,
## # filled_PulseOximetry <dbl>, filled_Respirations <dbl>,
## # filled_BloodPressureDiff <dbl>, filled_BloodPressureUpper <dbl>, …
# Checking missing values in all columns of OUD
missing_values <- colSums(is.na(OUD)) > 0
# Print the columns with missing values
print(missing_values)
## ObservationRecordKey Age
## FALSE FALSE
## Gender PrimaryInsuranceCategory
## FALSE FALSE
## InitPatientClassAndFirstPostOUClass Flipped
## FALSE FALSE
## OU_LOS_hrs DRG01
## FALSE FALSE
## BloodPressureUpper BloodPressureLower
## TRUE FALSE
## BloodPressureDiff Pulse
## TRUE TRUE
## PulseOximetry Respirations
## TRUE TRUE
## Temperature filled_PulseOximetry
## TRUE FALSE
## filled_Respirations filled_BloodPressureDiff
## FALSE FALSE
## filled_BloodPressureUpper filled_Pulse
## FALSE FALSE
## filled_Temperature
## FALSE
# Check the column names in OUD
print(colnames(OUD))
## [1] "ObservationRecordKey" "Age"
## [3] "Gender" "PrimaryInsuranceCategory"
## [5] "InitPatientClassAndFirstPostOUClass" "Flipped"
## [7] "OU_LOS_hrs" "DRG01"
## [9] "BloodPressureUpper" "BloodPressureLower"
## [11] "BloodPressureDiff" "Pulse"
## [13] "PulseOximetry" "Respirations"
## [15] "Temperature" "filled_PulseOximetry"
## [17] "filled_Respirations" "filled_BloodPressureDiff"
## [19] "filled_BloodPressureUpper" "filled_Pulse"
## [21] "filled_Temperature"
# Remove unuseful columns (if they exist)
OUD_clean <- OUD %>%
select(-one_of("ObservationRecordKey", "InitPatientClassAndFirstPostOUClass"))
library(ggplot2)
# Histogram of Age, colored by Flipped
ggplot(OUD, aes(x = Age, fill = Flipped)) +
geom_histogram(position = "identity", alpha = 0.7, bins = 25, color = "black", size = 0.3) + # Adjusted bin size and alpha
labs(title = "Age Distribution by Flipped Status",
subtitle = "Comparison of patients who flipped vs. those who did not",
x = "Age (years)",
y = "Frequency") +
scale_fill_manual(values = c("#FF6347", "#4682B4"), labels = c("Not Flipped", "Flipped")) + # Added color contrast
theme_minimal(base_size = 14) + # Changed base size for better readability
theme(plot.title = element_text(hjust = 0.5, face = "bold", color = "#333333", size = 20),
plot.subtitle = element_text(hjust = 0.5, color = "#555555", size = 14),
axis.title = element_text(face = "bold", color = "#444444"),
axis.text = element_text(color = "#666666"),
legend.title = element_blank(),
legend.text = element_text(color = "#444444"),
panel.grid.major = element_line(color = "#dddddd", linetype = "dotted"),
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "#f9f9f9", color = "#f9f9f9"),
plot.background = element_rect(fill = "#f9f9f9", color = "#f9f9f9")) +
annotate("text", x = 60, y = 60, label = "Flipped patients tend to be younger", size = 5, color = "#4682B4", fontface = "italic")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
library(ggplot2)
library(dplyr)
# Calculate the proportion of patients flipped to inpatient status by DRG code
proportion_data <- OUD %>%
group_by(DRG01) %>%
summarise(Proportion_Inpatient = mean(Flipped == 1, na.rm = TRUE))
# Sort the data by proportion in descending order to highlight the highest
proportion_data <- proportion_data %>%
arrange(desc(Proportion_Inpatient))
# Reorder DRG01 factor levels based on Proportion_Inpatient
proportion_data$DRG01 <- factor(proportion_data$DRG01, levels = proportion_data$DRG01)
# Create the enhanced bar chart
ggplot(proportion_data, aes(x = DRG01, y = Proportion_Inpatient, fill = Proportion_Inpatient)) +
geom_bar(stat = "identity", color = "white", show.legend = FALSE) +
scale_fill_gradient(low = "#FF6347", high = "#4682B4") + # Color gradient based on proportion
labs(title = "Proportion of Patients Flipped to Inpatient Status by DRG Code",
subtitle = "Higher proportions of flipped patients are shown in darker blue",
x = "DRG Code",
y = "Proportion of Inpatients") +
theme_minimal(base_size = 14) +
theme(
axis.title.x = element_text(face = "bold", size = 14, color = "#333333"),
axis.title.y = element_text(face = "bold", size = 14, color = "#333333"),
axis.text.x = element_text(angle = 45, hjust = 1, color = "#444444"), # Angled x-axis labels for better readability
plot.title = element_text(face = "bold", size = 16, color = "#333333", hjust = 0.5),
plot.subtitle = element_text(size = 12, color = "#555555", hjust = 0.5),
panel.grid.major = element_line(color = "#eeeeee", linetype = "dotted"),
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "#f9f9f9", color = "#f9f9f9"),
plot.background = element_rect(fill = "#f9f9f9", color = "#f9f9f9")
) +
geom_text(aes(label = scales::percent(Proportion_Inpatient, accuracy = 0.1)), vjust = -0.5, color = "#333333", fontface = "bold")
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
library(ggbeeswarm)
# Define RGB colors for orange and green
orange_rgb <- rgb(255, 165, 0, maxColorValue = 255) # Orange
green_rgb <- rgb(0, 128, 0, maxColorValue = 255) # Green
# Pulse Oximetry and Flipping Rate (Density Plot)
p1 <- ggplot(OUD_clean, aes(x = PulseOximetry, fill = factor(Flipped))) +
geom_density(alpha = 0.6) +
scale_fill_manual(values = c(orange_rgb, green_rgb)) +
labs(title = "Pulse Oximetry Distribution by Flipping Rate",
x = "Pulse Oximetry", y = "Density") +
theme_minimal(base_size = 8) +
theme(
axis.title.x = element_text(face = "bold", size = 14, color = "#333333"),
axis.title.y = element_text(face = "bold", size = 14, color = "#333333"),
plot.title = element_text(face = "bold", size = 18, color = "#333333", hjust = 0.5),
axis.text.x = element_text(color = "#444444", size = 12),
axis.text.y = element_text(color = "#444444", size = 12),
panel.grid.major = element_line(color = "#eeeeee", linetype = "dotted", linewidth = 0.8),
panel.grid.minor = element_blank(),
plot.background = element_rect(fill = "#f9f9f9"),
panel.background = element_rect(fill = "#ffffff")
)
plot(p1)
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_density()`).
# Length of Stay and Flipping Rate (Density Plot)
p2 <- ggplot(OUD_clean, aes(x = OU_LOS_hrs, fill = factor(Flipped))) +
geom_density(alpha = 0.6) +
scale_fill_manual(values = c(orange_rgb, green_rgb)) +
labs(title = "Length of Stay Distribution by Flipping Rate",
x = "Length of Stay (hrs)", y = "Density") +
theme_minimal(base_size = 8) +
theme(
axis.title.x = element_text(face = "bold", size = 14, color = "#333333"),
axis.title.y = element_text(face = "bold", size = 14, color = "#333333"),
plot.title = element_text(face = "bold", size = 18, color = "#333333", hjust = 0.5),
axis.text.x = element_text(color = "#444444", size = 12),
axis.text.y = element_text(color = "#444444", size = 12),
panel.grid.major = element_line(color = "#eeeeee", linetype = "dotted", linewidth = 0.8),
panel.grid.minor = element_blank(),
plot.background = element_rect(fill = "#f9f9f9"),
panel.background = element_rect(fill = "#ffffff")
)
plot(p2)
# Blood Pressure Difference and Flipping Rate (Violin Plot)
p3 <- ggplot(OUD_clean, aes(x = factor(Flipped), y = BloodPressureDiff, fill = factor(Flipped))) +
geom_violin(trim = FALSE, color = "black") +
scale_fill_manual(values = c(orange_rgb, green_rgb)) +
labs(title = "Blood Pressure Difference vs Flipping Rate",
x = "Inpatient Status", y = "Blood Pressure Difference (mmHg)") +
theme_minimal(base_size = 8) +
theme(
axis.title.x = element_text(face = "bold", size = 14, color = "#333333"),
axis.title.y = element_text(face = "bold", size = 14, color = "#333333"),
plot.title = element_text(face = "bold", size = 18, color = "#333333", hjust = 0.5),
axis.text.x = element_text(color = "#444444", size = 12),
axis.text.y = element_text(color = "#444444", size = 12),
panel.grid.major = element_line(color = "#eeeeee", linetype = "dotted", linewidth = 0.8),
panel.grid.minor = element_blank(),
plot.background = element_rect(fill = "#f9f9f9"),
panel.background = element_rect(fill = "#ffffff")
)
plot(p3)
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
# Temperature and Flipping Rate (Dot Plot / Jitter Plot)
p4 <- ggplot(OUD_clean, aes(x = factor(Flipped), y = Temperature, color = factor(Flipped))) +
geom_jitter(width = 0.1, size = 2, alpha = 0.6) +
scale_color_manual(values = c(orange_rgb, green_rgb)) +
labs(title = "Temperature vs Flipping Rate",
x = "Inpatient Status", y = "Temperature (°C)") +
theme_minimal(base_size = 8) +
theme(
axis.title.x = element_text(face = "bold", size = 14, color = "#333333"),
axis.title.y = element_text(face = "bold", size = 14, color = "#333333"),
plot.title = element_text(face = "bold", size = 18, color = "#333333", hjust = 0.5),
axis.text.x = element_text(color = "#444444", size = 12),
axis.text.y = element_text(color = "#444444", size = 12),
panel.grid.major = element_line(color = "#eeeeee", linetype = "dotted", linewidth = 0.8),
panel.grid.minor = element_blank(),
plot.background = element_rect(fill = "#f9f9f9"),
panel.background = element_rect(fill = "#ffffff")
)
plot(p4)
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Blood Pressure Difference and Flipping Rate (Bee Swarm Plot)
p5 <- ggplot(OUD_clean, aes(x = factor(Flipped), y = BloodPressureDiff, color = factor(Flipped))) +
geom_beeswarm(size = 2, alpha = 0.6) +
scale_color_manual(values = c(orange_rgb, green_rgb)) +
labs(title = "Blood Pressure Difference vs Flipping Rate",
x = "Inpatient Status", y = "Blood Pressure Difference (mmHg)") +
theme_minimal(base_size = 8) +
theme(
axis.title.x = element_text(face = "bold", size = 14, color = "#333333"),
axis.title.y = element_text(face = "bold", size = 14, color = "#333333"),
plot.title = element_text(face = "bold", size = 18, color = "#333333", hjust = 0.5),
axis.text.x = element_text(color = "#444444", size = 12),
axis.text.y = element_text(color = "#444444", size = 12),
panel.grid.major = element_line(color = "#eeeeee", linetype = "dotted", linewidth = 0.8),
panel.grid.minor = element_blank(),
plot.background = element_rect(fill = "#f9f9f9"),
panel.background = element_rect(fill = "#ffffff")
)
plot(p5)
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
flipped_counts <- OUD %>%
group_by(PrimaryInsuranceCategory, Flipped) %>%
summarise(Count = n(), .groups = "drop") %>%
mutate(Flipped = factor(Flipped, labels = c("Not Flipped", "Flipped"))) # Convert to factor with labels
ggplot(flipped_counts, aes(x = PrimaryInsuranceCategory, y = Count, fill = Flipped)) +
geom_bar(stat = "identity", position = "stack", color = "white", size = 0.5) +
scale_fill_manual(values = c('#4B0082', '#008080'), # Dark purple and teal for bars
name = "Flipped Status") +
scale_y_continuous(labels = scales::comma) + # Format y-axis with commas for better readability
labs(x = "Primary Insurance Category", y = "Patient Count",
title = "Flipped Status Counts by Primary Insurance Category") +
theme_minimal(base_size = 5) +
theme(
axis.title.x = element_text(face = "italic", size = 16, color = "#FFFFFF", family = "Helvetica"),
axis.title.y = element_text(face = "italic", size = 16, color = "#FFFFFF", family = "Helvetica"),
plot.title = element_text(face = "bold", size = 20, color = "#FFFFFF", hjust = 0.5, family = "Georgia"),
axis.text.x = element_text(angle = 45, hjust = 1, size = 14, color = "#E0E0E0", family = "Arial"),
axis.text.y = element_text(size = 14, color = "#E0E0E0", family = "Arial"),
panel.grid.major = element_line(color = "#444444", linetype = "dotted", linewidth = 1), # Darker grid lines
panel.grid.minor = element_blank(),
plot.background = element_rect(fill = "#212121"), # Dark background
panel.background = element_rect(fill = "#2C2C2C"), # Dark panel background
legend.title = element_text(size = 14, face = "bold", color = "#FFFFFF"),
legend.text = element_text(size = 12, color = "#E0E0E0")
) +
geom_text(aes(label = scales::comma(Count)),
position = position_stack(vjust = 0.5),
size = 5,
color = "#FFFFFF",
fontface = "bold",
family = "Arial") + # Adding text labels inside bars for count visualization
theme(legend.position = "top", # Move the legend to the top
legend.background = element_rect(fill = "#2C2C2C", color = "black", size = 0.5), # Dark legend background
legend.key = element_rect(fill = "#2C2C2C", color = "black")) # Dark legend key
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
##Convert the Flipped variable into a factor to ensure it is recognized
as a categorical variable.
library(dplyr)
library(ggplot2)
library(scales) # For formatting numbers with commas
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
# Convert Flipped variable to factor to ensure it's treated as categorical
OUD$Flipped <- factor(OUD$Flipped, levels = c(0, 1), labels = c("Not Flipped", "Flipped"))
# Calculate percentage of flipped patients within each primary insurance category
flipped_percentages <- OUD %>%
group_by(PrimaryInsuranceCategory, Flipped) %>%
summarise(Count = n(), .groups = "drop") %>%
mutate(Percentage = (Count / sum(Count)) * 100)
# Plot bar chart with percentages
ggplot(flipped_percentages, aes(x = PrimaryInsuranceCategory, y = Percentage, fill = Flipped)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.8), color = "white", size = 1) +
geom_text(aes(label = paste0(round(Percentage, 2), "%")),
position = position_dodge(width = 0.8),
vjust = -0.5,
size = 4,
fontface = "bold",
color = "white") + # White text labels for better contrast
scale_fill_manual(values = c('#1E1E1E', '#5F9EA0'), # Dark gray and teal for a sleek look
labels = c('Not Flipped', 'Flipped'),
name = "Flipped Status") +
scale_y_continuous(labels = scales::comma) + # Format y-axis with commas for better readability
labs(x = "Primary Insurance Category",
y = "Percentage of Flipped Patients",
title = "Percentage of Flipped Patients by Primary Insurance Category") +
theme_minimal(base_size = 16) +
theme(
plot.background = element_rect(fill = "#212121"), # Dark background for the entire plot
panel.background = element_rect(fill = "#2C2C2C"), # Dark background for the panel area
axis.title.x = element_text(face = "bold", size = 14, color = "#FFFFFF", family = "Helvetica"),
axis.title.y = element_text(face = "bold", size = 14, color = "#FFFFFF", family = "Helvetica"),
plot.title = element_text(face = "bold", size = 16, color = "#FFFFFF", hjust = 0.5, family = "Georgia"),
axis.text.x = element_text(angle = 45, hjust = 1, size = 12, color = "#E0E0E0", family = "Arial"),
axis.text.y = element_text(size = 12, color = "#E0E0E0", family = "Arial"),
panel.grid.major = element_line(color = "#444444", linetype = "dotted", linewidth = 1), # Dark grid lines
panel.grid.minor = element_blank(),
legend.title = element_text(size = 14, face = "bold", color = "#FFFFFF"),
legend.text = element_text(size = 12, color = "#E0E0E0"),
legend.background = element_rect(fill = "#2C2C2C", color = "black", size = 0.5), # Dark legend background
legend.key = element_rect(fill = "#2C2C2C", color = "black") # Dark legend key
)
## Plot for Pulse Oximetry vs Length of Stay
OUD_clean <- OUD %>% filter(!is.na(OU_LOS_hrs) & !is.na(Age))
ggplot(OUD_clean, aes(x = OU_LOS_hrs, y = Age)) +
geom_point(aes(color = as.factor(Flipped)), size = 3, alpha = 0.7) +
scale_color_manual(values = c("#90EE90", "#00008B"),
labels = c("Not Flipped", "Flipped"),
name = "Flipped Status") +
geom_smooth(method = "lm", color = "#FFA500", se = FALSE, linetype = "dashed", size = 1) +
labs(
x = "Length of Stay in Observation Unit (Hours)",
y = "Patient Age (Years)",
title = "Age vs Length of Stay in Observation Unit"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
library(ggplot2)
library(dplyr)
# Create a frequency table for heatmap
heatmap_data <- OUD %>%
group_by(Gender, Flipped) %>%
summarise(Count = n(), .groups = "drop")
# Generate a heatmap
ggplot(heatmap_data, aes(x = Gender, y = as.factor(Flipped), fill = Count)) +
geom_tile(color = "black", size = 0.8) + # Add black borders for clarity
geom_text(aes(label = Count), size = 6, color = "white") + # Show count inside cells
scale_fill_gradient(low = "#90EE90", high = "#00008B") +
labs(x = "Gender", y = "Flipped Status", fill = "Count",
title = "Flipped Status Heatmap by Gender",
subtitle = "Intensity Based on Patient Counts") +
theme_minimal(base_size = 16) +
theme(
plot.title = element_text(face = "bold", size = 18, hjust = 0.5),
plot.subtitle = element_text(face = "italic", size = 14, hjust = 0.5),
axis.title.x = element_text(face = "bold", size = 14),
axis.title.y = element_text(face = "bold", size = 14),
legend.title = element_text(face = "bold"),
legend.background = element_rect(fill = "white", color = "black"),
legend.text = element_text(size = 12)
)
library(corrplot)
# Select numerical variables
numerical_vars <- OUD[, sapply(OUD, is.numeric)]
# Calculate the correlation matrix
correlation_matrix <- cor(numerical_vars, use = "complete.obs")
# Define a custom diverging color palette
col <- colorRampPalette(c("#6D9EC1", "white", "#E46726"))(200)
# Increase the size of the plotting device (for macOS)
quartz(width = 10, height = 20) # Set plot size to 12x10 inches
# Adjust margins for better spacing
par(mar = c(2, 2, 3, 2))
corrplot(correlation_matrix,
method = "color",
type = "upper",
addCoef.col = "black",
tl.col = "black",
tl.srt = 45,
number.cex = 0.5, # Smaller correlation coefficients
tl.cex = 0.7, # Smaller variable labels
diag = FALSE,
col = col,
tl.offset = 0.5,
cl.pos = "r",
cl.ratio = 0.2,
cl.length = 5,
cl.align.text = "c",
cl.cex = 0.8,
addgrid.col = "gray90",
mar = c(0, 0, 1, 0)
)
## Dependent variable “FLipped”
# Load necessary libraries
library(ggplot2)
library(dplyr)
# Frequency and proportion table
tbl <- table(OUD$Flipped)
prop_tbl <- round(prop.table(tbl), 3) # More precision
# Convert to dataframe for plotting
df <- as.data.frame(tbl)
colnames(df) <- c("Flipped", "Count")
df$Proportion <- prop_tbl[df$Flipped] # Add proportion column
# Create the bar chart with proportion labels
ggplot(df, aes(x = as.factor(Flipped), y = Count, fill = as.factor(Flipped))) +
geom_bar(stat = "identity", width = 0.5, show.legend = FALSE) +
geom_text(aes(label = paste0(Count, " (", Proportion * 100, "%)")), vjust = -0.5, size = 5, fontface = "bold") +
scale_fill_manual(values = c("navy", "darkred"), labels = c("Stayed in Observation", "Flipped to Inpatient")) +
scale_y_continuous(sec.axis = sec_axis(~ . / sum(tbl), name = "Proportion")) +
labs(title = "Proportion of Non-Flipped and Flipped Patients",
subtitle = paste0("Total Patients: ", sum(tbl)),
x = "Patient Status",
y = "Count",
caption = "Proportions displayed as percentages") +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5, size = 16),
plot.subtitle = element_text(hjust = 0.5, size = 12),
axis.title.x = element_text(face = "bold"),
axis.title.y = element_text(face = "bold"),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank()
)
## Data Partion
# Load necessary libraries
library(caret)
# Set seed for reproducibility
set.seed(123)
# Convert 'Flipped' column to binary (0,1)
OUD$Flipped <- ifelse(OUD$Flipped == "Flipped", 1, 0)
# Generate random indexes for training (70%) and testing (30%) sets
train.index <- sample(nrow(OUD), 0.7 * nrow(OUD))
test.index <- setdiff(seq_len(nrow(OUD)), train.index)
# Split the data
train_data <- OUD[train.index, ]
test_data <- OUD[test.index, ]
# Convert target variable to factor
train_data$Flipped <- as.factor(train_data$Flipped)
test_data$Flipped <- as.factor(test_data$Flipped)
# Function to replace NA values with median for numeric columns
replace_na_with_median <- function(df) {
numeric_cols <- sapply(df, is.numeric) # Identify numeric columns
df[numeric_cols] <- lapply(df[numeric_cols], function(x) {
x[is.na(x)] <- median(x, na.rm = TRUE) # Replace NA with median
return(x)
})
return(df)
}
# Apply the function to training and testing datasets
train_data <- replace_na_with_median(train_data)
test_data <- replace_na_with_median(test_data)
# Logistic Regression Model
logistic_model <- glm(Flipped ~ Age + Gender + PrimaryInsuranceCategory + OU_LOS_hrs +
DRG01 + BloodPressureUpper + BloodPressureLower +
BloodPressureDiff + Pulse + PulseOximetry + Respirations +
Temperature,
data = train_data, family = "binomial",
control = glm.control(maxit = 500))
# Check for convergence issues
summary(logistic_model)
##
## Call:
## glm(formula = Flipped ~ Age + Gender + PrimaryInsuranceCategory +
## OU_LOS_hrs + DRG01 + BloodPressureUpper + BloodPressureLower +
## BloodPressureDiff + Pulse + PulseOximetry + Respirations +
## Temperature, family = "binomial", data = train_data, control = glm.control(maxit = 500))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.770e+00 9.011e+00 -1.084 0.278254
## Age -1.598e-02 7.255e-03 -2.203 0.027616
## GenderMale 4.078e-02 1.880e-01 0.217 0.828322
## PrimaryInsuranceCategoryMEDICAID STATE 5.253e-02 4.730e-01 0.111 0.911570
## PrimaryInsuranceCategoryMEDICARE 9.869e-01 3.874e-01 2.547 0.010861
## PrimaryInsuranceCategoryMEDICARE OTHER -3.222e-02 3.981e-01 -0.081 0.935497
## PrimaryInsuranceCategoryPrivate 1.226e-01 3.627e-01 0.338 0.735365
## OU_LOS_hrs 3.034e-02 2.806e-03 10.814 < 2e-16
## DRG01428 3.181e-01 4.903e-01 0.649 0.516505
## DRG01486 -1.861e-01 4.572e-01 -0.407 0.683918
## DRG01558 5.071e-01 5.890e-01 0.861 0.389285
## DRG01577 8.257e-01 7.681e-01 1.075 0.282416
## DRG01578 -3.596e-01 5.259e-01 -0.684 0.494041
## DRG01599 -4.411e-01 4.261e-01 -1.035 0.300565
## DRG01780 -1.590e+00 3.275e-01 -4.853 1.21e-06
## DRG01782 -5.855e-01 7.420e-01 -0.789 0.430104
## DRG01786 -1.287e+00 3.680e-01 -3.498 0.000469
## DRG01787 -9.231e-01 4.143e-01 -2.228 0.025867
## DRG01789 -1.004e+00 3.624e-01 -2.769 0.005618
## BloodPressureUpper 3.103e-03 4.396e-03 0.706 0.480225
## BloodPressureLower -9.512e-03 7.293e-03 -1.304 0.192153
## BloodPressureDiff 1.479e-03 4.460e-03 0.332 0.740105
## Pulse 4.220e-03 5.749e-03 0.734 0.463004
## PulseOximetry 2.037e-06 3.436e-02 0.000 0.999953
## Respirations -4.794e-03 2.147e-02 -0.223 0.823319
## Temperature 9.569e-02 8.605e-02 1.112 0.266116
##
## (Intercept)
## Age *
## GenderMale
## PrimaryInsuranceCategoryMEDICAID STATE
## PrimaryInsuranceCategoryMEDICARE *
## PrimaryInsuranceCategoryMEDICARE OTHER
## PrimaryInsuranceCategoryPrivate
## OU_LOS_hrs ***
## DRG01428
## DRG01486
## DRG01558
## DRG01577
## DRG01578
## DRG01599
## DRG01780 ***
## DRG01782
## DRG01786 ***
## DRG01787 *
## DRG01789 **
## BloodPressureUpper
## BloodPressureLower
## BloodPressureDiff
## Pulse
## PulseOximetry
## Respirations
## Temperature
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1072.36 on 776 degrees of freedom
## Residual deviance: 785.04 on 751 degrees of freedom
## AIC: 837.04
##
## Number of Fisher Scoring iterations: 5
# Stepwise Regression for feature selection
step_model <- step(logistic_model, direction = "both")
## Start: AIC=837.04
## Flipped ~ Age + Gender + PrimaryInsuranceCategory + OU_LOS_hrs +
## DRG01 + BloodPressureUpper + BloodPressureLower + BloodPressureDiff +
## Pulse + PulseOximetry + Respirations + Temperature
##
## Df Deviance AIC
## - PulseOximetry 1 785.04 835.04
## - Gender 1 785.09 835.09
## - Respirations 1 785.09 835.09
## - BloodPressureDiff 1 785.15 835.15
## - BloodPressureUpper 1 785.54 835.54
## - Pulse 1 785.58 835.58
## - Temperature 1 786.28 836.28
## - BloodPressureLower 1 786.74 836.74
## <none> 785.04 837.04
## - Age 1 789.95 839.95
## - PrimaryInsuranceCategory 4 810.35 854.35
## - DRG01 11 846.04 876.04
## - OU_LOS_hrs 1 979.05 1029.05
##
## Step: AIC=835.04
## Flipped ~ Age + Gender + PrimaryInsuranceCategory + OU_LOS_hrs +
## DRG01 + BloodPressureUpper + BloodPressureLower + BloodPressureDiff +
## Pulse + Respirations + Temperature
##
## Df Deviance AIC
## - Gender 1 785.09 833.09
## - Respirations 1 785.09 833.09
## - BloodPressureDiff 1 785.15 833.15
## - BloodPressureUpper 1 785.54 833.54
## - Pulse 1 785.59 833.59
## - Temperature 1 786.28 834.28
## - BloodPressureLower 1 786.77 834.77
## <none> 785.04 835.04
## + PulseOximetry 1 785.04 837.04
## - Age 1 790.01 838.01
## - PrimaryInsuranceCategory 4 810.40 852.40
## - DRG01 11 846.12 874.12
## - OU_LOS_hrs 1 979.06 1027.06
##
## Step: AIC=833.09
## Flipped ~ Age + PrimaryInsuranceCategory + OU_LOS_hrs + DRG01 +
## BloodPressureUpper + BloodPressureLower + BloodPressureDiff +
## Pulse + Respirations + Temperature
##
## Df Deviance AIC
## - Respirations 1 785.15 831.15
## - BloodPressureDiff 1 785.20 831.20
## - BloodPressureUpper 1 785.57 831.57
## - Pulse 1 785.62 831.62
## - Temperature 1 786.38 832.38
## - BloodPressureLower 1 786.78 832.78
## <none> 785.09 833.09
## + Gender 1 785.04 835.04
## + PulseOximetry 1 785.09 835.09
## - Age 1 790.15 836.15
## - PrimaryInsuranceCategory 4 810.48 850.48
## - DRG01 11 846.13 872.13
## - OU_LOS_hrs 1 979.73 1025.73
##
## Step: AIC=831.15
## Flipped ~ Age + PrimaryInsuranceCategory + OU_LOS_hrs + DRG01 +
## BloodPressureUpper + BloodPressureLower + BloodPressureDiff +
## Pulse + Temperature
##
## Df Deviance AIC
## - BloodPressureDiff 1 785.27 829.27
## - BloodPressureUpper 1 785.60 829.60
## - Pulse 1 785.64 829.64
## - Temperature 1 786.47 830.47
## - BloodPressureLower 1 786.88 830.88
## <none> 785.15 831.15
## + Respirations 1 785.09 833.09
## + Gender 1 785.09 833.09
## + PulseOximetry 1 785.15 833.15
## - Age 1 790.22 834.22
## - PrimaryInsuranceCategory 4 810.52 848.52
## - DRG01 11 846.13 870.13
## - OU_LOS_hrs 1 979.94 1023.94
##
## Step: AIC=829.27
## Flipped ~ Age + PrimaryInsuranceCategory + OU_LOS_hrs + DRG01 +
## BloodPressureUpper + BloodPressureLower + Pulse + Temperature
##
## Df Deviance AIC
## - BloodPressureUpper 1 785.69 827.69
## - Pulse 1 785.80 827.80
## - Temperature 1 786.57 828.57
## - BloodPressureLower 1 786.96 828.96
## <none> 785.27 829.27
## + BloodPressureDiff 1 785.15 831.15
## + Respirations 1 785.20 831.20
## + Gender 1 785.21 831.21
## + PulseOximetry 1 785.27 831.27
## - Age 1 790.43 832.43
## - PrimaryInsuranceCategory 4 810.66 846.66
## - DRG01 11 846.30 868.30
## - OU_LOS_hrs 1 980.05 1022.05
##
## Step: AIC=827.69
## Flipped ~ Age + PrimaryInsuranceCategory + OU_LOS_hrs + DRG01 +
## BloodPressureLower + Pulse + Temperature
##
## Df Deviance AIC
## - Pulse 1 786.14 826.14
## - BloodPressureLower 1 786.97 826.97
## - Temperature 1 787.08 827.08
## <none> 785.69 827.69
## + BloodPressureUpper 1 785.27 829.27
## + BloodPressureDiff 1 785.60 829.60
## + Respirations 1 785.66 829.66
## + Gender 1 785.66 829.66
## + PulseOximetry 1 785.69 829.69
## - Age 1 790.61 830.61
## - PrimaryInsuranceCategory 4 811.13 845.13
## - DRG01 11 846.30 866.30
## - OU_LOS_hrs 1 980.17 1020.17
##
## Step: AIC=826.14
## Flipped ~ Age + PrimaryInsuranceCategory + OU_LOS_hrs + DRG01 +
## BloodPressureLower + Temperature
##
## Df Deviance AIC
## - BloodPressureLower 1 787.19 825.19
## - Temperature 1 787.82 825.82
## <none> 786.14 826.14
## + Pulse 1 785.69 827.69
## + BloodPressureUpper 1 785.80 827.80
## + BloodPressureDiff 1 786.02 828.02
## + Gender 1 786.12 828.12
## + PulseOximetry 1 786.13 828.13
## + Respirations 1 786.13 828.13
## - Age 1 791.41 829.41
## - PrimaryInsuranceCategory 4 811.67 843.67
## - DRG01 11 847.12 865.12
## - OU_LOS_hrs 1 980.17 1018.17
##
## Step: AIC=825.19
## Flipped ~ Age + PrimaryInsuranceCategory + OU_LOS_hrs + DRG01 +
## Temperature
##
## Df Deviance AIC
## - Temperature 1 788.83 824.83
## <none> 787.19 825.19
## + BloodPressureLower 1 786.14 826.14
## + Pulse 1 786.97 826.97
## + BloodPressureDiff 1 787.08 827.08
## + Respirations 1 787.12 827.12
## + PulseOximetry 1 787.15 827.15
## + Gender 1 787.18 827.18
## + BloodPressureUpper 1 787.19 827.19
## - Age 1 792.38 828.38
## - PrimaryInsuranceCategory 4 812.39 842.39
## - DRG01 11 849.12 865.12
## - OU_LOS_hrs 1 980.98 1016.98
##
## Step: AIC=824.83
## Flipped ~ Age + PrimaryInsuranceCategory + OU_LOS_hrs + DRG01
##
## Df Deviance AIC
## <none> 788.83 824.83
## + Temperature 1 787.19 825.19
## + BloodPressureLower 1 787.82 825.82
## + Pulse 1 788.39 826.39
## + BloodPressureDiff 1 788.74 826.74
## + Respirations 1 788.75 826.75
## + Gender 1 788.78 826.78
## + PulseOximetry 1 788.78 826.78
## + BloodPressureUpper 1 788.83 826.83
## - Age 1 793.95 827.95
## - PrimaryInsuranceCategory 4 814.02 842.02
## - DRG01 11 850.36 864.36
## - OU_LOS_hrs 1 982.86 1016.86
summary(step_model)
##
## Call:
## glm(formula = Flipped ~ Age + PrimaryInsuranceCategory + OU_LOS_hrs +
## DRG01, family = "binomial", data = train_data, control = glm.control(maxit = 500))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.309862 0.519741 -0.596 0.551052
## Age -0.015938 0.007087 -2.249 0.024519
## PrimaryInsuranceCategoryMEDICAID STATE 0.075414 0.465948 0.162 0.871423
## PrimaryInsuranceCategoryMEDICARE 0.980352 0.385009 2.546 0.010887
## PrimaryInsuranceCategoryMEDICARE OTHER -0.028396 0.395753 -0.072 0.942799
## PrimaryInsuranceCategoryPrivate 0.118560 0.360942 0.328 0.742553
## OU_LOS_hrs 0.030243 0.002792 10.832 < 2e-16
## DRG01428 0.276947 0.480179 0.577 0.564103
## DRG01486 -0.127735 0.449622 -0.284 0.776338
## DRG01558 0.504422 0.586525 0.860 0.389780
## DRG01577 0.747287 0.767393 0.974 0.330156
## DRG01578 -0.302138 0.520587 -0.580 0.561659
## DRG01599 -0.390473 0.420089 -0.930 0.352629
## DRG01780 -1.570000 0.322929 -4.862 1.16e-06
## DRG01782 -0.650213 0.734101 -0.886 0.375764
## DRG01786 -1.299789 0.362122 -3.589 0.000331
## DRG01787 -0.929946 0.411921 -2.258 0.023972
## DRG01789 -1.035670 0.360072 -2.876 0.004024
##
## (Intercept)
## Age *
## PrimaryInsuranceCategoryMEDICAID STATE
## PrimaryInsuranceCategoryMEDICARE *
## PrimaryInsuranceCategoryMEDICARE OTHER
## PrimaryInsuranceCategoryPrivate
## OU_LOS_hrs ***
## DRG01428
## DRG01486
## DRG01558
## DRG01577
## DRG01578
## DRG01599
## DRG01780 ***
## DRG01782
## DRG01786 ***
## DRG01787 *
## DRG01789 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1072.36 on 776 degrees of freedom
## Residual deviance: 788.83 on 759 degrees of freedom
## AIC: 824.83
##
## Number of Fisher Scoring iterations: 5
# Predict probabilities on test data
logistic_pred <- predict(step_model, test_data, type = "response")
# Find optimal cutoff for classification
cutoffs <- seq(0, 1, 0.1)
accuracy <- sapply(cutoffs, function(cut) {
predicted_class <- ifelse(logistic_pred > cut, 1, 0)
# Ensure both predicted and actual classes have the same factor levels
predicted_class <- factor(predicted_class, levels = c(0, 1))
actual_class <- factor(test_data$Flipped, levels = c(0, 1))
# Calculate accuracy
confusionMatrix(predicted_class, actual_class)$overall["Accuracy"]
})
# Print accuracy for each cutoff
data.frame(Cutoff = cutoffs, Accuracy = accuracy)
# Plot accuracy vs cutoff
plot(cutoffs, accuracy, type = "b", col = "blue", xlab = "Cutoff", ylab = "Accuracy",
main = "Accuracy vs Cutoff for Logistic Regression")
abline(v = cutoffs[which.max(accuracy)], col = "red", lty = 2)
legend("topright", legend = paste("Best Cutoff =", cutoffs[which.max(accuracy)]), col = "red", lty = 2)
best_cutoff <- cutoffs[which.max(accuracy)]
logistic_class <- ifelse(logistic_pred > best_cutoff, 1, 0)
conf_matrix_logistic <- confusionMatrix(factor(logistic_class), factor(test_data$Flipped))
print(conf_matrix_logistic)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 151 38
## 1 29 116
##
## Accuracy : 0.7994
## 95% CI : (0.7524, 0.841)
## No Information Rate : 0.5389
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.5946
##
## Mcnemar's Test P-Value : 0.3284
##
## Sensitivity : 0.8389
## Specificity : 0.7532
## Pos Pred Value : 0.7989
## Neg Pred Value : 0.8000
## Prevalence : 0.5389
## Detection Rate : 0.4521
## Detection Prevalence : 0.5659
## Balanced Accuracy : 0.7961
##
## 'Positive' Class : 0
##
library(pROC) # Ensure the pROC package is installed and loaded
# Calculate ROC curve
roc_logistic <- roc(test_data$Flipped, logistic_pred)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Plot ROC curve with enhancements
plot(roc_logistic,
main = "ROC Curve for Logistic Regression",
col = "#1c61b6", # Custom color for the ROC curve
lwd = 2, # Thicker line for better visibility
print.auc = TRUE, # Display AUC value on the plot
auc.polygon = TRUE, # Fill the area under the curve
auc.polygon.col = "#1c61b622", # Semi-transparent fill color
print.thres = "best", # Highlight the optimal threshold
print.thres.col = "red", # Color for the optimal threshold
print.thres.adj = c(1.2, 1.2), # Adjust position of the threshold label
grid = TRUE, # Add a grid for better readability
legacy.axes = TRUE) # Use 0-1 for the x-axis (False Positive Rate)
auc_logistic <- auc(roc_logistic)
cat("AUC for Logistic Regression:", auc_logistic, "\n")
## AUC for Logistic Regression: 0.8506854
# Load required libraries
library(rpart) # For building the classification tree
library(rpart.plot) # For visualizing the tree
library(caret) # For confusion matrix
# Build the classification tree
default.tree <- rpart(Flipped ~ Age + PrimaryInsuranceCategory + OU_LOS_hrs + DRG01,
data = train_data, method = "class")
# Predict on the test data
default.tree.pred <- predict(default.tree, test_data, type = "class")
# Visualize the tree
rpart.plot(default.tree,
type = 4, # Display full tree structure
extra = 104, # Show probabilities and percentages
box.palette = "Blues", # Use a color palette
shadow.col = "gray", # Add shadows for depth
nn = TRUE, # Show node numbers
fallen.leaves = TRUE, # Align leaves at the bottom
main = "Classification Tree")
# Confusion Matrix
cfm <- confusionMatrix(default.tree.pred, factor(test_data$Flipped))
print(cfm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 140 30
## 1 40 124
##
## Accuracy : 0.7904
## 95% CI : (0.7428, 0.8328)
## No Information Rate : 0.5389
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.5803
##
## Mcnemar's Test P-Value : 0.2821
##
## Sensitivity : 0.7778
## Specificity : 0.8052
## Pos Pred Value : 0.8235
## Neg Pred Value : 0.7561
## Prevalence : 0.5389
## Detection Rate : 0.4192
## Detection Prevalence : 0.5090
## Balanced Accuracy : 0.7915
##
## 'Positive' Class : 0
##
# Plot confusion matrix with custom colors
plot(cfm$table,
col = c("green", "yellow"), # Custom colors for the confusion matrix
main = "Confusion Matrix",
xlab = "Actual",
ylab = "Predicted")
# Advanced Confusion Matrix Plot using ggplot2
cfm_table <- as.data.frame(cfm$table)
ggplot(cfm_table, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile(color = "white") +
geom_text(aes(label = Freq), color = "black", size = 6) +
scale_fill_gradient(low = "white", high = "navy") +
labs(title = "Confusion Matrix",
x = "Actual",
y = "Predicted",
fill = "Frequency") +
theme_minimal() +
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 14),
plot.title = element_text(size = 16, face = "bold"))
# Calculate additional performance metrics
precision <- cfm$byClass["Pos Pred Value"]
recall <- cfm$byClass["Sensitivity"]
f1_score <- cfm$byClass["F1"]
cat("Precision:", precision, "\n")
## Precision: 0.8235294
cat("Recall:", recall, "\n")
## Recall: 0.7777778
cat("F1-Score:", f1_score, "\n")
## F1-Score: 0.8
# ROC curve
library(ROCR)
predict.tree.prob = predict(default.tree, newdata = test_data, type = "prob")
# plot ROC and compute AUC
library(pROC)
ROC.tree = roc(response = test_data$Flipped, predictor = predict.tree.prob[,2])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(ROC.tree, print.thres = c(0.5))
auc = ROC.tree$auc
auc
## Area under the curve: 0.816
library(randomForest)
library(caret)
library(ggplot2)
# Ensure 'Flipped' is a factor for classification
train_data$Flipped <- as.factor(train_data$Flipped)
test_data$Flipped <- as.factor(test_data$Flipped)
# Tune 'mtry' parameter for optimal model performance
best_mtry <- tuneRF(train_data[, c("Age", "PrimaryInsuranceCategory", "OU_LOS_hrs", "DRG01")],
train_data$Flipped,
stepFactor = 1.5,
improve = 0.01,
ntreeTry = 100,
trace = FALSE,
plot = FALSE)
## -0.02369668 0.01
optimal_mtry <- best_mtry[which.min(best_mtry[, 2]), 1] # Extract best 'mtry' value
# Train Random Forest Model with Optimal Parameters
rf <- randomForest(Flipped ~ Age + PrimaryInsuranceCategory + OU_LOS_hrs + DRG01,
data = train_data,
ntree = 500,
mtry = optimal_mtry,
nodesize = 5,
importance = TRUE)
# Improved Variable Importance Plot
varImpPlot(rf, type = 1, main = "Feature Importance", col = "blue")
# Predictions on Test Set
rf.pred <- predict(rf, test_data)
# Enhanced Confusion Matrix Output
conf_matrix <- confusionMatrix(factor(rf.pred), factor(test_data$Flipped))
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 144 35
## 1 36 119
##
## Accuracy : 0.7874
## 95% CI : (0.7396, 0.8301)
## No Information Rate : 0.5389
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.5725
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8000
## Specificity : 0.7727
## Pos Pred Value : 0.8045
## Neg Pred Value : 0.7677
## Prevalence : 0.5389
## Detection Rate : 0.4311
## Detection Prevalence : 0.5359
## Balanced Accuracy : 0.7864
##
## 'Positive' Class : 0
##
# Visualize Confusion Matrix with ggplot
conf_matrix_df <- as.data.frame(conf_matrix$table)
ggplot(conf_matrix_df, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile(color = "white") +
geom_text(aes(label = Freq), vjust = 1, size = 5) +
scale_fill_gradient(low = "white", high = "blue") +
labs(title = "Confusion Matrix - Random Forest", x = "Actual", y = "Predicted") +
theme_minimal()
## Naive Bayes
library(caret)
library(e1071)
# Running Naive Bayes
naive <- naiveBayes(Flipped ~., data = train_data)
# Predicting probabilities
pred.naive.prob <- predict(naive, newdata = test_data, type = "raw")
# Predicting class membership
pred.naive.class <- predict(naive, newdata = test_data)
#Confusion matrix
confusionMatrix(factor(pred.naive.class), factor(test_data$Flipped))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 179 1
## 1 1 153
##
## Accuracy : 0.994
## 95% CI : (0.9785, 0.9993)
## No Information Rate : 0.5389
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.988
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9944
## Specificity : 0.9935
## Pos Pred Value : 0.9944
## Neg Pred Value : 0.9935
## Prevalence : 0.5389
## Detection Rate : 0.5359
## Detection Prevalence : 0.5389
## Balanced Accuracy : 0.9940
##
## 'Positive' Class : 0
##
# library(openxlsx)
# write.xlsx(cbind(test_data, predict.tree.prob) , file = "test_result.xlsx")