library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(caret)
## Loading required package: lattice
library(corrplot)
## corrplot 0.95 loaded
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(e1071)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(readr)
OUData <- read_csv("OUData.csv")
## Rows: 1111 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): ObservationRecordKey, Gender, PrimaryInsuranceCategory, InitPatien...
## dbl  (5): Age, Flipped, OU_LOS_hrs, DRG01, BloodPressureLower
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
OUD<- OUData

Data Summary

summary(OUD)
##  ObservationRecordKey      Age           Gender         
##  Length:1111          Min.   :19.00   Length:1111       
##  Class :character     1st Qu.:57.00   Class :character  
##  Mode  :character     Median :73.00   Mode  :character  
##                       Mean   :69.03                     
##                       3rd Qu.:86.00                     
##                       Max.   :89.00                     
##  PrimaryInsuranceCategory InitPatientClassAndFirstPostOUClass    Flipped      
##  Length:1111              Length:1111                         Min.   :0.0000  
##  Class :character         Class :character                    1st Qu.:0.0000  
##  Mode  :character         Mode  :character                    Median :0.0000  
##                                                               Mean   :0.4608  
##                                                               3rd Qu.:1.0000  
##                                                               Max.   :1.0000  
##    OU_LOS_hrs         DRG01       BloodPressureUpper BloodPressureLower
##  Min.   :  1.20   Min.   :276.0   Length:1111        Min.   :  0.00    
##  1st Qu.: 24.40   1st Qu.:577.0   Class :character   1st Qu.: 65.00    
##  Median : 45.80   Median :780.0   Mode  :character   Median : 74.00    
##  Mean   : 62.25   Mean   :664.9                      Mean   : 75.27    
##  3rd Qu.: 84.40   3rd Qu.:786.0                      3rd Qu.: 85.00    
##  Max.   :405.50   Max.   :789.0                      Max.   :146.00    
##  BloodPressureDiff     Pulse           PulseOximetry      Respirations      
##  Length:1111        Length:1111        Length:1111        Length:1111       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Temperature       
##  Length:1111       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
str(OUD)
## spc_tbl_ [1,111 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ObservationRecordKey               : chr [1:1111] "905459x1" "443621z2" "131565z1" "438080x1" ...
##  $ Age                                : num [1:1111] 54 89 83 89 81 54 83 50 52 85 ...
##  $ Gender                             : chr [1:1111] "Male" "Female" "Female" "Female" ...
##  $ PrimaryInsuranceCategory           : chr [1:1111] "MEDICAID STATE" "MEDICARE" "MEDICARE" "MEDICARE" ...
##  $ InitPatientClassAndFirstPostOUClass: chr [1:1111] "OBSERVATION->INPATIENT" "OBSERVATION->OBSERVATION" "OBSERVATION->INPATIENT" "OBSERVATION->OBSERVATION" ...
##  $ Flipped                            : num [1:1111] 1 0 1 0 0 0 1 1 0 1 ...
##  $ OU_LOS_hrs                         : num [1:1111] 37.3 89.7 96.3 13.3 25.4 ...
##  $ DRG01                              : num [1:1111] 428 599 786 780 428 786 558 780 786 780 ...
##  $ BloodPressureUpper                 : chr [1:1111] "153" "123" "105" "162" ...
##  $ BloodPressureLower                 : num [1:1111] 111 64 55 73 60 56 61 48 85 60 ...
##  $ BloodPressureDiff                  : chr [1:1111] "26" "68" "29" "83" ...
##  $ Pulse                              : chr [1:1111] "73" "86" "81" "76" ...
##  $ PulseOximetry                      : chr [1:1111] "100" "94" "94" "97" ...
##  $ Respirations                       : chr [1:1111] "18" "18" "18" "24" ...
##  $ Temperature                        : chr [1:1111] "98.2" "95.9" "97.3" "98.1" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ObservationRecordKey = col_character(),
##   ..   Age = col_double(),
##   ..   Gender = col_character(),
##   ..   PrimaryInsuranceCategory = col_character(),
##   ..   InitPatientClassAndFirstPostOUClass = col_character(),
##   ..   Flipped = col_double(),
##   ..   OU_LOS_hrs = col_double(),
##   ..   DRG01 = col_double(),
##   ..   BloodPressureUpper = col_character(),
##   ..   BloodPressureLower = col_double(),
##   ..   BloodPressureDiff = col_character(),
##   ..   Pulse = col_character(),
##   ..   PulseOximetry = col_character(),
##   ..   Respirations = col_character(),
##   ..   Temperature = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
OUD <- OUD %>%
  mutate(
    across(c(BloodPressureUpper, BloodPressureDiff, Pulse, PulseOximetry, Respirations, Temperature), 
           ~ as.numeric(gsub("[^0-9.]", "", .)))
  )

Transform Variable Types

OUD <- OUD %>%
  mutate(
    across(c(Gender, PrimaryInsuranceCategory, InitPatientClassAndFirstPostOUClass, Flipped, DRG01), as.factor),
    across(c(BloodPressureUpper, BloodPressureLower, BloodPressureDiff, Pulse, PulseOximetry, Respirations, Temperature), as.numeric)
  )

Check for Duplicates and Missing Values

# Assuming OUD is your dataset
missing_values <- data.frame(Variable = names(OUD), Missing_Values = colSums(is.na(OUD)))

# Load the kableExtra package
library(kableExtra)

# Create a styled table with kableExtra
kable(missing_values, caption = "Number of Missing Values") %>%
  kable_styling(bootstrap_options = "striped", full_width = F, position = "center") %>%
  column_spec(1, width = "25em") %>%
  column_spec(2, width = "10em") %>%
  row_spec(0, bold = T, color = "white", background = "#2C3E50") %>%
  row_spec(1:nrow(missing_values), background = "#ECF0F1")
Number of Missing Values
Variable Missing_Values
ObservationRecordKey ObservationRecordKey 0
Age Age 0
Gender Gender 0
PrimaryInsuranceCategory PrimaryInsuranceCategory 0
InitPatientClassAndFirstPostOUClass InitPatientClassAndFirstPostOUClass 0
Flipped Flipped 0
OU_LOS_hrs OU_LOS_hrs 0
DRG01 DRG01 0
BloodPressureUpper BloodPressureUpper 3
BloodPressureLower BloodPressureLower 0
BloodPressureDiff BloodPressureDiff 4
Pulse Pulse 4
PulseOximetry PulseOximetry 1
Respirations Respirations 4
Temperature Temperature 4

Data Visualization

# Histogram of Pulse Oximetry with units
hist_data <- OUD$PulseOximetry

hst <- hist(hist_data, 
            main = "Enhanced Histogram of Pulse Oximetry",
            xlab = "Pulse Oximetry (%)",  # Adding units for Pulse Oximetry
            ylab = "Frequency",
            col = rgb(0.2, 0.4, 0.8, 0.5),  
            border = "darkblue",
            breaks = 20,  
            prob = TRUE)  

# Adding density plot
lines(density(hist_data, na.rm = TRUE), 
      col = "red", 
      lwd = 2)

# Adding normal distribution curve
xfit <- seq(min(hist_data, na.rm = TRUE), max(hist_data, na.rm = TRUE), length = 100)
yfit <- dnorm(xfit, mean = mean(hist_data, na.rm = TRUE), sd = sd(hist_data, na.rm = TRUE))
lines(xfit, yfit, col = "darkgreen", lwd = 2, lty = 2)

# Adding grid and legend
grid(col = "gray", lty = "dotted")

legend("topright", legend = c("Density Plot", "Normal Curve"), 
       col = c("red", "darkgreen"), lwd = 2, lty = c(1, 2))

# Histogram of Respirations with units
hist_data2 <- OUD$Respirations

hst2 <- hist(hist_data2, 
             main = "Enhanced Histogram of Respirations",
             xlab = "Respirations (breaths per minute)",  # Adding units for Respirations
             ylab = "Density",
             col = rgb(0.2, 0.4, 0.8, 0.5),  
             border = "darkblue",
             breaks = 20,  
             prob = TRUE)  

# Adding density plot
lines(density(hist_data2, na.rm = TRUE), 
      col = "red", 
      lwd = 2)

# Adding normal distribution curve
xfit2 <- seq(min(hist_data2, na.rm = TRUE), max(hist_data2, na.rm = TRUE), length = 100)
yfit2 <- dnorm(xfit2, mean = mean(hist_data2, na.rm = TRUE), sd = sd(hist_data2, na.rm = TRUE))
lines(xfit2, yfit2, col = "darkgreen", lwd = 2, lty = 2)

# Adding grid and legend
grid(col = "gray", lty = "dotted")

legend("topright", legend = c("Density Plot", "Normal Curve"), 
       col = c("red", "darkgreen"), lwd = 2, lty = c(1, 2))

# Histogram of Blood Pressure (Upper) after scaling
hst3 <- hist(OUD$BloodPressureUpper,
             main = "Enhanced Histogram of Blood Pressure (Upper)",
             xlab = "Blood Pressure (Upper) (mmHg)",  # Adding units for Blood Pressure (Upper)
             ylab = "Density", 
             col = rgb(0.2, 0.4, 0.8, 0.5),
             border = "blue", 
             breaks = 20,
             prob = TRUE)  

# Adding density plot
lines(density(OUD$BloodPressureUpper, na.rm = TRUE), 
      col = "red", 
      lwd = 2)

# Adding normal distribution curve
xfit3 <- seq(min(OUD$BloodPressureUpper, na.rm = TRUE), max(OUD$BloodPressureUpper, na.rm = TRUE), length = 100)
yfit3 <- dnorm(xfit3, mean = mean(OUD$BloodPressureUpper, na.rm = TRUE), sd = sd(OUD$BloodPressureUpper, na.rm = TRUE))
lines(xfit3, yfit3, col = "darkgreen", lwd = 2, lty = 2)

# Adding grid and legend
grid(col = "gray", lty = "dotted")

legend("topright", legend = c("Density Plot", "Normal Curve"), 
       col = c("red", "darkgreen"), lwd = 2, lty = c(1, 2))

# Histogram of Pulse Rate
hst4 <- hist(OUD$Pulse,
             main = "Enhanced Histogram of Pulse Rate",
             xlab = "Pulse (beats per minute)",  # Adding units for Pulse
             ylab = "Density", 
             col = rgb(0.2, 0.4, 0.8, 0.5),
             border = "blue", 
             breaks = 20,
             prob = TRUE)  

# Adding density plot
lines(density(OUD$Pulse, na.rm = TRUE), 
      col = "red", 
      lwd = 2)

# Adding normal distribution curve
xfit4 <- seq(min(OUD$Pulse, na.rm = TRUE), max(OUD$Pulse, na.rm = TRUE), length = 100)
yfit4 <- dnorm(xfit4, mean = mean(OUD$Pulse, na.rm = TRUE), sd = sd(OUD$Pulse, na.rm = TRUE))
lines(xfit4, yfit4, col = "darkgreen", lwd = 2, lty = 2)

# Adding grid and legend
grid(col = "gray", lty = "dotted")

legend("topright", legend = c("Density Plot", "Normal Curve"), 
       col = c("red", "darkgreen"), lwd = 2, lty = c(1, 2))

# Histogram of Body Temperature
hst5 <- hist(OUD$Temperature,
             main = "Enhanced Histogram of Body Temperature",
             xlab = "Temperature (°C)",  # Adding units for Temperature
             ylab = "Density", 
             col = rgb(0.2, 0.4, 0.8, 0.5),
             border = "blue", 
             breaks = 20,
             prob = TRUE)  

# Adding density plot
lines(density(OUD$Temperature, na.rm = TRUE), 
      col = "red", 
      lwd = 2)

# Adding normal distribution curve
xfit5 <- seq(min(OUD$Temperature, na.rm = TRUE), max(OUD$Temperature, na.rm = TRUE), length = 100)
yfit5 <- dnorm(xfit5, mean = mean(OUD$Temperature, na.rm = TRUE), sd = sd(OUD$Temperature, na.rm = TRUE))
lines(xfit5, yfit5, col = "darkgreen", lwd = 2, lty = 2)

# Adding grid and legend
grid(col = "gray", lty = "dotted")

legend("topright", legend = c("Density Plot", "Normal Curve"), 
       col = c("red", "darkgreen"), lwd = 2, lty = c(1, 2))

## NA values with Median As the variables are Skewed

library(dplyr)

OUD <- OUD %>%
  mutate(across(c(PulseOximetry, Respirations, BloodPressureDiff, BloodPressureUpper, Pulse, Temperature), 
                ~ ifelse(is.na(.), median(., na.rm = TRUE), .), 
                .names = "filled_{.col}"))  # Adds '_filled' prefix for clarity

# Print the updated OUD dataset
print(OUD)
## # A tibble: 1,111 × 21
##    ObservationRecordKey   Age Gender PrimaryInsuranceCategory
##    <chr>                <dbl> <fct>  <fct>                   
##  1 905459x1                54 Male   MEDICAID STATE          
##  2 443621z2                89 Female MEDICARE                
##  3 131565z1                83 Female MEDICARE                
##  4 438080x1                89 Female MEDICARE                
##  5 763005z1                81 Female MEDICARE OTHER          
##  6 244227z1                54 Female MEDICARE OTHER          
##  7 448887x1                83 Female MEDICAID STATE          
##  8 371392x1                50 Female MEDICARE                
##  9 859289z1                52 Male   Private                 
## 10 436418z1                85 Female MEDICARE OTHER          
## # ℹ 1,101 more rows
## # ℹ 17 more variables: InitPatientClassAndFirstPostOUClass <fct>,
## #   Flipped <fct>, OU_LOS_hrs <dbl>, DRG01 <fct>, BloodPressureUpper <dbl>,
## #   BloodPressureLower <dbl>, BloodPressureDiff <dbl>, Pulse <dbl>,
## #   PulseOximetry <dbl>, Respirations <dbl>, Temperature <dbl>,
## #   filled_PulseOximetry <dbl>, filled_Respirations <dbl>,
## #   filled_BloodPressureDiff <dbl>, filled_BloodPressureUpper <dbl>, …
# Checking missing values in all columns of OUD
missing_values <- colSums(is.na(OUD)) > 0

# Print the columns with missing values
print(missing_values)
##                ObservationRecordKey                                 Age 
##                               FALSE                               FALSE 
##                              Gender            PrimaryInsuranceCategory 
##                               FALSE                               FALSE 
## InitPatientClassAndFirstPostOUClass                             Flipped 
##                               FALSE                               FALSE 
##                          OU_LOS_hrs                               DRG01 
##                               FALSE                               FALSE 
##                  BloodPressureUpper                  BloodPressureLower 
##                                TRUE                               FALSE 
##                   BloodPressureDiff                               Pulse 
##                                TRUE                                TRUE 
##                       PulseOximetry                        Respirations 
##                                TRUE                                TRUE 
##                         Temperature                filled_PulseOximetry 
##                                TRUE                               FALSE 
##                 filled_Respirations            filled_BloodPressureDiff 
##                               FALSE                               FALSE 
##           filled_BloodPressureUpper                        filled_Pulse 
##                               FALSE                               FALSE 
##                  filled_Temperature 
##                               FALSE
# Check the column names in OUD
print(colnames(OUD))
##  [1] "ObservationRecordKey"                "Age"                                
##  [3] "Gender"                              "PrimaryInsuranceCategory"           
##  [5] "InitPatientClassAndFirstPostOUClass" "Flipped"                            
##  [7] "OU_LOS_hrs"                          "DRG01"                              
##  [9] "BloodPressureUpper"                  "BloodPressureLower"                 
## [11] "BloodPressureDiff"                   "Pulse"                              
## [13] "PulseOximetry"                       "Respirations"                       
## [15] "Temperature"                         "filled_PulseOximetry"               
## [17] "filled_Respirations"                 "filled_BloodPressureDiff"           
## [19] "filled_BloodPressureUpper"           "filled_Pulse"                       
## [21] "filled_Temperature"
# Remove unuseful columns (if they exist)
OUD_clean <- OUD %>%
  select(-one_of("ObservationRecordKey", "InitPatientClassAndFirstPostOUClass"))
library(ggplot2)

# Histogram of Age, colored by Flipped
ggplot(OUD, aes(x = Age, fill = Flipped)) +
  geom_histogram(position = "identity", alpha = 0.7, bins = 25, color = "black", size = 0.3) +  # Adjusted bin size and alpha
  labs(title = "Age Distribution by Flipped Status",
       subtitle = "Comparison of patients who flipped vs. those who did not",
       x = "Age (years)",
       y = "Frequency") +
  scale_fill_manual(values = c("#FF6347", "#4682B4"), labels = c("Not Flipped", "Flipped")) +  # Added color contrast
  theme_minimal(base_size = 14) +  # Changed base size for better readability
  theme(plot.title = element_text(hjust = 0.5, face = "bold", color = "#333333", size = 20),
        plot.subtitle = element_text(hjust = 0.5, color = "#555555", size = 14),
        axis.title = element_text(face = "bold", color = "#444444"),
        axis.text = element_text(color = "#666666"),
        legend.title = element_blank(),
        legend.text = element_text(color = "#444444"),
        panel.grid.major = element_line(color = "#dddddd", linetype = "dotted"),
        panel.grid.minor = element_blank(),
        panel.background = element_rect(fill = "#f9f9f9", color = "#f9f9f9"),
        plot.background = element_rect(fill = "#f9f9f9", color = "#f9f9f9")) +
  annotate("text", x = 60, y = 60, label = "Flipped patients tend to be younger", size = 5, color = "#4682B4", fontface = "italic")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

library(ggplot2)
library(dplyr)

# Calculate the proportion of patients flipped to inpatient status by DRG code
proportion_data <- OUD %>%
  group_by(DRG01) %>%
  summarise(Proportion_Inpatient = mean(Flipped == 1, na.rm = TRUE)) 

# Sort the data by proportion in descending order to highlight the highest
proportion_data <- proportion_data %>%
  arrange(desc(Proportion_Inpatient))

# Reorder DRG01 factor levels based on Proportion_Inpatient
proportion_data$DRG01 <- factor(proportion_data$DRG01, levels = proportion_data$DRG01)

# Create the enhanced bar chart
ggplot(proportion_data, aes(x = DRG01, y = Proportion_Inpatient, fill = Proportion_Inpatient)) +
  geom_bar(stat = "identity", color = "white", show.legend = FALSE) +
  scale_fill_gradient(low = "#FF6347", high = "#4682B4") +  # Color gradient based on proportion
  labs(title = "Proportion of Patients Flipped to Inpatient Status by DRG Code",
       subtitle = "Higher proportions of flipped patients are shown in darker blue",
       x = "DRG Code",
       y = "Proportion of Inpatients") +
  theme_minimal(base_size = 14) +
  theme(
    axis.title.x = element_text(face = "bold", size = 14, color = "#333333"),
    axis.title.y = element_text(face = "bold", size = 14, color = "#333333"),
    axis.text.x = element_text(angle = 45, hjust = 1, color = "#444444"),  # Angled x-axis labels for better readability
    plot.title = element_text(face = "bold", size = 16, color = "#333333", hjust = 0.5),
    plot.subtitle = element_text(size = 12, color = "#555555", hjust = 0.5),
    panel.grid.major = element_line(color = "#eeeeee", linetype = "dotted"),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "#f9f9f9", color = "#f9f9f9"),
    plot.background = element_rect(fill = "#f9f9f9", color = "#f9f9f9")
  ) +
  geom_text(aes(label = scales::percent(Proportion_Inpatient, accuracy = 0.1)), vjust = -0.5, color = "#333333", fontface = "bold")

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
library(ggbeeswarm)

# Define RGB colors for orange and green
orange_rgb <- rgb(255, 165, 0, maxColorValue = 255)  # Orange
green_rgb <- rgb(0, 128, 0, maxColorValue = 255)     # Green
# Pulse Oximetry and Flipping Rate (Density Plot)
p1 <- ggplot(OUD_clean, aes(x = PulseOximetry, fill = factor(Flipped))) +
  geom_density(alpha = 0.6) +
  scale_fill_manual(values = c(orange_rgb, green_rgb)) +
  labs(title = "Pulse Oximetry Distribution by Flipping Rate",
       x = "Pulse Oximetry", y = "Density") +
  theme_minimal(base_size = 8) +
  theme(
    axis.title.x = element_text(face = "bold", size = 14, color = "#333333"),
    axis.title.y = element_text(face = "bold", size = 14, color = "#333333"),
    plot.title = element_text(face = "bold", size = 18, color = "#333333", hjust = 0.5),
    axis.text.x = element_text(color = "#444444", size = 12),
    axis.text.y = element_text(color = "#444444", size = 12),
    panel.grid.major = element_line(color = "#eeeeee", linetype = "dotted", linewidth = 0.8),
    panel.grid.minor = element_blank(),
    plot.background = element_rect(fill = "#f9f9f9"),
    panel.background = element_rect(fill = "#ffffff")
  )
plot(p1)
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_density()`).

# Length of Stay and Flipping Rate (Density Plot)
p2 <- ggplot(OUD_clean, aes(x = OU_LOS_hrs, fill = factor(Flipped))) +
  geom_density(alpha = 0.6) +
  scale_fill_manual(values = c(orange_rgb, green_rgb)) +
  labs(title = "Length of Stay Distribution by Flipping Rate",
       x = "Length of Stay (hrs)", y = "Density") +
  theme_minimal(base_size = 8) +
  theme(
    axis.title.x = element_text(face = "bold", size = 14, color = "#333333"),
    axis.title.y = element_text(face = "bold", size = 14, color = "#333333"),
    plot.title = element_text(face = "bold", size = 18, color = "#333333", hjust = 0.5),
    axis.text.x = element_text(color = "#444444", size = 12),
    axis.text.y = element_text(color = "#444444", size = 12),
    panel.grid.major = element_line(color = "#eeeeee", linetype = "dotted", linewidth = 0.8),
    panel.grid.minor = element_blank(),
    plot.background = element_rect(fill = "#f9f9f9"),
    panel.background = element_rect(fill = "#ffffff")
  )
plot(p2)

# Blood Pressure Difference and Flipping Rate (Violin Plot)
p3 <- ggplot(OUD_clean, aes(x = factor(Flipped), y = BloodPressureDiff, fill = factor(Flipped))) +
  geom_violin(trim = FALSE, color = "black") +
  scale_fill_manual(values = c(orange_rgb, green_rgb)) +
  labs(title = "Blood Pressure Difference vs Flipping Rate",
       x = "Inpatient Status", y = "Blood Pressure Difference (mmHg)") +
  theme_minimal(base_size = 8) +
  theme(
    axis.title.x = element_text(face = "bold", size = 14, color = "#333333"),
    axis.title.y = element_text(face = "bold", size = 14, color = "#333333"),
    plot.title = element_text(face = "bold", size = 18, color = "#333333", hjust = 0.5),
    axis.text.x = element_text(color = "#444444", size = 12),
    axis.text.y = element_text(color = "#444444", size = 12),
    panel.grid.major = element_line(color = "#eeeeee", linetype = "dotted", linewidth = 0.8),
    panel.grid.minor = element_blank(),
    plot.background = element_rect(fill = "#f9f9f9"),
    panel.background = element_rect(fill = "#ffffff")
  )
plot(p3)
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

# Temperature and Flipping Rate (Dot Plot / Jitter Plot)
p4 <- ggplot(OUD_clean, aes(x = factor(Flipped), y = Temperature, color = factor(Flipped))) +
  geom_jitter(width = 0.1, size = 2, alpha = 0.6) +
  scale_color_manual(values = c(orange_rgb, green_rgb)) +
  labs(title = "Temperature vs Flipping Rate",
       x = "Inpatient Status", y = "Temperature (°C)") +
  theme_minimal(base_size = 8) +
  theme(
    axis.title.x = element_text(face = "bold", size = 14, color = "#333333"),
    axis.title.y = element_text(face = "bold", size = 14, color = "#333333"),
    plot.title = element_text(face = "bold", size = 18, color = "#333333", hjust = 0.5),
    axis.text.x = element_text(color = "#444444", size = 12),
    axis.text.y = element_text(color = "#444444", size = 12),
    panel.grid.major = element_line(color = "#eeeeee", linetype = "dotted", linewidth = 0.8),
    panel.grid.minor = element_blank(),
    plot.background = element_rect(fill = "#f9f9f9"),
    panel.background = element_rect(fill = "#ffffff")
  )
plot(p4)
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Blood Pressure Difference and Flipping Rate (Bee Swarm Plot)
p5 <- ggplot(OUD_clean, aes(x = factor(Flipped), y = BloodPressureDiff, color = factor(Flipped))) +
  geom_beeswarm(size = 2, alpha = 0.6) +
  scale_color_manual(values = c(orange_rgb, green_rgb)) +
  labs(title = "Blood Pressure Difference vs Flipping Rate",
       x = "Inpatient Status", y = "Blood Pressure Difference (mmHg)") +
  theme_minimal(base_size = 8) +
  theme(
    axis.title.x = element_text(face = "bold", size = 14, color = "#333333"),
    axis.title.y = element_text(face = "bold", size = 14, color = "#333333"),
    plot.title = element_text(face = "bold", size = 18, color = "#333333", hjust = 0.5),
    axis.text.x = element_text(color = "#444444", size = 12),
    axis.text.y = element_text(color = "#444444", size = 12),
    panel.grid.major = element_line(color = "#eeeeee", linetype = "dotted", linewidth = 0.8),
    panel.grid.minor = element_blank(),
    plot.background = element_rect(fill = "#f9f9f9"),
    panel.background = element_rect(fill = "#ffffff")
  )
plot(p5)
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).

flipped_counts <- OUD %>% 
  group_by(PrimaryInsuranceCategory, Flipped) %>%
  summarise(Count = n(), .groups = "drop") %>%
  mutate(Flipped = factor(Flipped, labels = c("Not Flipped", "Flipped")))  # Convert to factor with labels

ggplot(flipped_counts, aes(x = PrimaryInsuranceCategory, y = Count, fill = Flipped)) + 
  geom_bar(stat = "identity", position = "stack", color = "white", size = 0.5) +
  scale_fill_manual(values = c('#4B0082', '#008080'),  # Dark purple and teal for bars
                    name = "Flipped Status") +
  scale_y_continuous(labels = scales::comma) +  # Format y-axis with commas for better readability
  labs(x = "Primary Insurance Category", y = "Patient Count", 
       title = "Flipped Status Counts by Primary Insurance Category") +
  theme_minimal(base_size = 5) +
  theme(
    axis.title.x = element_text(face = "italic", size = 16, color = "#FFFFFF", family = "Helvetica"),
    axis.title.y = element_text(face = "italic", size = 16, color = "#FFFFFF", family = "Helvetica"),
    plot.title = element_text(face = "bold", size = 20, color = "#FFFFFF", hjust = 0.5, family = "Georgia"),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 14, color = "#E0E0E0", family = "Arial"),
    axis.text.y = element_text(size = 14, color = "#E0E0E0", family = "Arial"),
    panel.grid.major = element_line(color = "#444444", linetype = "dotted", linewidth = 1),  # Darker grid lines
    panel.grid.minor = element_blank(),
    plot.background = element_rect(fill = "#212121"),  # Dark background
    panel.background = element_rect(fill = "#2C2C2C"),  # Dark panel background
    legend.title = element_text(size = 14, face = "bold", color = "#FFFFFF"),
    legend.text = element_text(size = 12, color = "#E0E0E0")
  ) +
  geom_text(aes(label = scales::comma(Count)), 
            position = position_stack(vjust = 0.5), 
            size = 5, 
            color = "#FFFFFF", 
            fontface = "bold", 
            family = "Arial") +  # Adding text labels inside bars for count visualization
  theme(legend.position = "top",  # Move the legend to the top
        legend.background = element_rect(fill = "#2C2C2C", color = "black", size = 0.5),  # Dark legend background
        legend.key = element_rect(fill = "#2C2C2C", color = "black"))  # Dark legend key
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

##Convert the Flipped variable into a factor to ensure it is recognized as a categorical variable.

library(dplyr)
library(ggplot2)
library(scales)  # For formatting numbers with commas
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
# Convert Flipped variable to factor to ensure it's treated as categorical
OUD$Flipped <- factor(OUD$Flipped, levels = c(0, 1), labels = c("Not Flipped", "Flipped"))

# Calculate percentage of flipped patients within each primary insurance category
flipped_percentages <- OUD %>%
  group_by(PrimaryInsuranceCategory, Flipped) %>%
  summarise(Count = n(), .groups = "drop") %>%
  mutate(Percentage = (Count / sum(Count)) * 100) 

# Plot bar chart with percentages
ggplot(flipped_percentages, aes(x = PrimaryInsuranceCategory, y = Percentage, fill = Flipped)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.8), color = "white", size = 1) +
  geom_text(aes(label = paste0(round(Percentage, 2), "%")), 
            position = position_dodge(width = 0.8), 
            vjust = -0.5, 
            size = 4, 
            fontface = "bold", 
            color = "white") +  # White text labels for better contrast
  scale_fill_manual(values = c('#1E1E1E', '#5F9EA0'),  # Dark gray and teal for a sleek look
                    labels = c('Not Flipped', 'Flipped'),
                    name = "Flipped Status") +
  scale_y_continuous(labels = scales::comma) +  # Format y-axis with commas for better readability
  labs(x = "Primary Insurance Category", 
       y = "Percentage of Flipped Patients", 
       title = "Percentage of Flipped Patients by Primary Insurance Category") +
  theme_minimal(base_size = 16) +
  theme(
    plot.background = element_rect(fill = "#212121"),  # Dark background for the entire plot
    panel.background = element_rect(fill = "#2C2C2C"),  # Dark background for the panel area
    axis.title.x = element_text(face = "bold", size = 14, color = "#FFFFFF", family = "Helvetica"),
    axis.title.y = element_text(face = "bold", size = 14, color = "#FFFFFF", family = "Helvetica"),
    plot.title = element_text(face = "bold", size = 16, color = "#FFFFFF", hjust = 0.5, family = "Georgia"),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12, color = "#E0E0E0", family = "Arial"),
    axis.text.y = element_text(size = 12, color = "#E0E0E0", family = "Arial"),
    panel.grid.major = element_line(color = "#444444", linetype = "dotted", linewidth = 1),  # Dark grid lines
    panel.grid.minor = element_blank(),
    legend.title = element_text(size = 14, face = "bold", color = "#FFFFFF"),
    legend.text = element_text(size = 12, color = "#E0E0E0"),
    legend.background = element_rect(fill = "#2C2C2C", color = "black", size = 0.5),  # Dark legend background
    legend.key = element_rect(fill = "#2C2C2C", color = "black")  # Dark legend key
  )

## Plot for Pulse Oximetry vs Length of Stay

OUD_clean <- OUD %>% filter(!is.na(OU_LOS_hrs) & !is.na(Age))

ggplot(OUD_clean, aes(x = OU_LOS_hrs, y = Age)) +
  geom_point(aes(color = as.factor(Flipped)), size = 3, alpha = 0.7) +
  scale_color_manual(values = c("#90EE90", "#00008B"), 
                     labels = c("Not Flipped", "Flipped"), 
                     name = "Flipped Status") +
  geom_smooth(method = "lm", color = "#FFA500", se = FALSE, linetype = "dashed", size = 1) +
  labs(
    x = "Length of Stay in Observation Unit (Hours)", 
    y = "Patient Age (Years)", 
    title = "Age vs Length of Stay in Observation Unit"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

library(ggplot2)
library(dplyr)

# Create a frequency table for heatmap
heatmap_data <- OUD %>%
  group_by(Gender, Flipped) %>%
  summarise(Count = n(), .groups = "drop")

# Generate a heatmap
ggplot(heatmap_data, aes(x = Gender, y = as.factor(Flipped), fill = Count)) +
  geom_tile(color = "black", size = 0.8) +  # Add black borders for clarity
  geom_text(aes(label = Count), size = 6, color = "white") +  # Show count inside cells
  scale_fill_gradient(low = "#90EE90", high = "#00008B") +
  labs(x = "Gender", y = "Flipped Status", fill = "Count",
       title = "Flipped Status Heatmap by Gender",
       subtitle = "Intensity Based on Patient Counts") +
  theme_minimal(base_size = 16) +
  theme(
    plot.title = element_text(face = "bold", size = 18, hjust = 0.5),
    plot.subtitle = element_text(face = "italic", size = 14, hjust = 0.5),
    axis.title.x = element_text(face = "bold", size = 14),
    axis.title.y = element_text(face = "bold", size = 14),
    legend.title = element_text(face = "bold"),
    legend.background = element_rect(fill = "white", color = "black"),
    legend.text = element_text(size = 12)
  )

Correlation Matrix

library(corrplot)

# Select numerical variables
numerical_vars <- OUD[, sapply(OUD, is.numeric)]

# Calculate the correlation matrix
correlation_matrix <- cor(numerical_vars, use = "complete.obs")

# Define a custom diverging color palette
col <- colorRampPalette(c("#6D9EC1", "white", "#E46726"))(200)

# Increase the size of the plotting device (for macOS)
quartz(width = 10, height = 20)  # Set plot size to 12x10 inches

# Adjust margins for better spacing
par(mar = c(2, 2, 3, 2))
corrplot(correlation_matrix, 
         method = "color",        
         type = "upper",          
         addCoef.col = "black",   
         tl.col = "black",        
         tl.srt = 45,             
         number.cex = 0.5,        # Smaller correlation coefficients
         tl.cex = 0.7,            # Smaller variable labels
         diag = FALSE,            
         col = col,               
         tl.offset = 0.5,         
         cl.pos = "r",            
         cl.ratio = 0.2,          
         cl.length = 5,           
         cl.align.text = "c",     
         cl.cex = 0.8,            
         addgrid.col = "gray90",  
         mar = c(0, 0, 1, 0)      
)

## Dependent variable “FLipped”

# Load necessary libraries
library(ggplot2)
library(dplyr)

# Frequency and proportion table
tbl <- table(OUD$Flipped)
prop_tbl <- round(prop.table(tbl), 3)  # More precision

# Convert to dataframe for plotting
df <- as.data.frame(tbl)
colnames(df) <- c("Flipped", "Count")
df$Proportion <- prop_tbl[df$Flipped]  # Add proportion column

# Create the bar chart with proportion labels
ggplot(df, aes(x = as.factor(Flipped), y = Count, fill = as.factor(Flipped))) +
  geom_bar(stat = "identity", width = 0.5, show.legend = FALSE) +
  geom_text(aes(label = paste0(Count, " (", Proportion * 100, "%)")), vjust = -0.5, size = 5, fontface = "bold") +
  scale_fill_manual(values = c("navy", "darkred"), labels = c("Stayed in Observation", "Flipped to Inpatient")) +
  scale_y_continuous(sec.axis = sec_axis(~ . / sum(tbl), name = "Proportion")) + 
  labs(title = "Proportion of Non-Flipped and Flipped Patients",
       subtitle = paste0("Total Patients: ", sum(tbl)),
       x = "Patient Status",
       y = "Count",
       caption = "Proportions displayed as percentages") +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5, size = 16),
    plot.subtitle = element_text(hjust = 0.5, size = 12),
    axis.title.x = element_text(face = "bold"),
    axis.title.y = element_text(face = "bold"),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )

## Data Partion

# Load necessary libraries
library(caret)

# Set seed for reproducibility
set.seed(123)

# Convert 'Flipped' column to binary (0,1)
OUD$Flipped <- ifelse(OUD$Flipped == "Flipped", 1, 0)

# Generate random indexes for training (70%) and testing (30%) sets
train.index <- sample(nrow(OUD), 0.7 * nrow(OUD))
test.index <- setdiff(seq_len(nrow(OUD)), train.index)

# Split the data
train_data <- OUD[train.index, ]
test_data <- OUD[test.index, ]

# Convert target variable to factor
train_data$Flipped <- as.factor(train_data$Flipped)
test_data$Flipped <- as.factor(test_data$Flipped)

# Function to replace NA values with median for numeric columns
replace_na_with_median <- function(df) {
  numeric_cols <- sapply(df, is.numeric)  # Identify numeric columns
  df[numeric_cols] <- lapply(df[numeric_cols], function(x) {
    x[is.na(x)] <- median(x, na.rm = TRUE)  # Replace NA with median
    return(x)
  })
  return(df)
}

# Apply the function to training and testing datasets
train_data <- replace_na_with_median(train_data)
test_data <- replace_na_with_median(test_data)

Logistic Regression

# Logistic Regression Model
logistic_model <- glm(Flipped ~ Age + Gender + PrimaryInsuranceCategory + OU_LOS_hrs + 
                      DRG01 + BloodPressureUpper + BloodPressureLower + 
                      BloodPressureDiff + Pulse + PulseOximetry + Respirations + 
                      Temperature, 
                      data = train_data, family = "binomial",
                      control = glm.control(maxit = 500))

# Check for convergence issues
summary(logistic_model)
## 
## Call:
## glm(formula = Flipped ~ Age + Gender + PrimaryInsuranceCategory + 
##     OU_LOS_hrs + DRG01 + BloodPressureUpper + BloodPressureLower + 
##     BloodPressureDiff + Pulse + PulseOximetry + Respirations + 
##     Temperature, family = "binomial", data = train_data, control = glm.control(maxit = 500))
## 
## Coefficients:
##                                          Estimate Std. Error z value Pr(>|z|)
## (Intercept)                            -9.770e+00  9.011e+00  -1.084 0.278254
## Age                                    -1.598e-02  7.255e-03  -2.203 0.027616
## GenderMale                              4.078e-02  1.880e-01   0.217 0.828322
## PrimaryInsuranceCategoryMEDICAID STATE  5.253e-02  4.730e-01   0.111 0.911570
## PrimaryInsuranceCategoryMEDICARE        9.869e-01  3.874e-01   2.547 0.010861
## PrimaryInsuranceCategoryMEDICARE OTHER -3.222e-02  3.981e-01  -0.081 0.935497
## PrimaryInsuranceCategoryPrivate         1.226e-01  3.627e-01   0.338 0.735365
## OU_LOS_hrs                              3.034e-02  2.806e-03  10.814  < 2e-16
## DRG01428                                3.181e-01  4.903e-01   0.649 0.516505
## DRG01486                               -1.861e-01  4.572e-01  -0.407 0.683918
## DRG01558                                5.071e-01  5.890e-01   0.861 0.389285
## DRG01577                                8.257e-01  7.681e-01   1.075 0.282416
## DRG01578                               -3.596e-01  5.259e-01  -0.684 0.494041
## DRG01599                               -4.411e-01  4.261e-01  -1.035 0.300565
## DRG01780                               -1.590e+00  3.275e-01  -4.853 1.21e-06
## DRG01782                               -5.855e-01  7.420e-01  -0.789 0.430104
## DRG01786                               -1.287e+00  3.680e-01  -3.498 0.000469
## DRG01787                               -9.231e-01  4.143e-01  -2.228 0.025867
## DRG01789                               -1.004e+00  3.624e-01  -2.769 0.005618
## BloodPressureUpper                      3.103e-03  4.396e-03   0.706 0.480225
## BloodPressureLower                     -9.512e-03  7.293e-03  -1.304 0.192153
## BloodPressureDiff                       1.479e-03  4.460e-03   0.332 0.740105
## Pulse                                   4.220e-03  5.749e-03   0.734 0.463004
## PulseOximetry                           2.037e-06  3.436e-02   0.000 0.999953
## Respirations                           -4.794e-03  2.147e-02  -0.223 0.823319
## Temperature                             9.569e-02  8.605e-02   1.112 0.266116
##                                           
## (Intercept)                               
## Age                                    *  
## GenderMale                                
## PrimaryInsuranceCategoryMEDICAID STATE    
## PrimaryInsuranceCategoryMEDICARE       *  
## PrimaryInsuranceCategoryMEDICARE OTHER    
## PrimaryInsuranceCategoryPrivate           
## OU_LOS_hrs                             ***
## DRG01428                                  
## DRG01486                                  
## DRG01558                                  
## DRG01577                                  
## DRG01578                                  
## DRG01599                                  
## DRG01780                               ***
## DRG01782                                  
## DRG01786                               ***
## DRG01787                               *  
## DRG01789                               ** 
## BloodPressureUpper                        
## BloodPressureLower                        
## BloodPressureDiff                         
## Pulse                                     
## PulseOximetry                             
## Respirations                              
## Temperature                               
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1072.36  on 776  degrees of freedom
## Residual deviance:  785.04  on 751  degrees of freedom
## AIC: 837.04
## 
## Number of Fisher Scoring iterations: 5

Stepwise Regression

# Stepwise Regression for feature selection
step_model <- step(logistic_model, direction = "both")
## Start:  AIC=837.04
## Flipped ~ Age + Gender + PrimaryInsuranceCategory + OU_LOS_hrs + 
##     DRG01 + BloodPressureUpper + BloodPressureLower + BloodPressureDiff + 
##     Pulse + PulseOximetry + Respirations + Temperature
## 
##                            Df Deviance     AIC
## - PulseOximetry             1   785.04  835.04
## - Gender                    1   785.09  835.09
## - Respirations              1   785.09  835.09
## - BloodPressureDiff         1   785.15  835.15
## - BloodPressureUpper        1   785.54  835.54
## - Pulse                     1   785.58  835.58
## - Temperature               1   786.28  836.28
## - BloodPressureLower        1   786.74  836.74
## <none>                          785.04  837.04
## - Age                       1   789.95  839.95
## - PrimaryInsuranceCategory  4   810.35  854.35
## - DRG01                    11   846.04  876.04
## - OU_LOS_hrs                1   979.05 1029.05
## 
## Step:  AIC=835.04
## Flipped ~ Age + Gender + PrimaryInsuranceCategory + OU_LOS_hrs + 
##     DRG01 + BloodPressureUpper + BloodPressureLower + BloodPressureDiff + 
##     Pulse + Respirations + Temperature
## 
##                            Df Deviance     AIC
## - Gender                    1   785.09  833.09
## - Respirations              1   785.09  833.09
## - BloodPressureDiff         1   785.15  833.15
## - BloodPressureUpper        1   785.54  833.54
## - Pulse                     1   785.59  833.59
## - Temperature               1   786.28  834.28
## - BloodPressureLower        1   786.77  834.77
## <none>                          785.04  835.04
## + PulseOximetry             1   785.04  837.04
## - Age                       1   790.01  838.01
## - PrimaryInsuranceCategory  4   810.40  852.40
## - DRG01                    11   846.12  874.12
## - OU_LOS_hrs                1   979.06 1027.06
## 
## Step:  AIC=833.09
## Flipped ~ Age + PrimaryInsuranceCategory + OU_LOS_hrs + DRG01 + 
##     BloodPressureUpper + BloodPressureLower + BloodPressureDiff + 
##     Pulse + Respirations + Temperature
## 
##                            Df Deviance     AIC
## - Respirations              1   785.15  831.15
## - BloodPressureDiff         1   785.20  831.20
## - BloodPressureUpper        1   785.57  831.57
## - Pulse                     1   785.62  831.62
## - Temperature               1   786.38  832.38
## - BloodPressureLower        1   786.78  832.78
## <none>                          785.09  833.09
## + Gender                    1   785.04  835.04
## + PulseOximetry             1   785.09  835.09
## - Age                       1   790.15  836.15
## - PrimaryInsuranceCategory  4   810.48  850.48
## - DRG01                    11   846.13  872.13
## - OU_LOS_hrs                1   979.73 1025.73
## 
## Step:  AIC=831.15
## Flipped ~ Age + PrimaryInsuranceCategory + OU_LOS_hrs + DRG01 + 
##     BloodPressureUpper + BloodPressureLower + BloodPressureDiff + 
##     Pulse + Temperature
## 
##                            Df Deviance     AIC
## - BloodPressureDiff         1   785.27  829.27
## - BloodPressureUpper        1   785.60  829.60
## - Pulse                     1   785.64  829.64
## - Temperature               1   786.47  830.47
## - BloodPressureLower        1   786.88  830.88
## <none>                          785.15  831.15
## + Respirations              1   785.09  833.09
## + Gender                    1   785.09  833.09
## + PulseOximetry             1   785.15  833.15
## - Age                       1   790.22  834.22
## - PrimaryInsuranceCategory  4   810.52  848.52
## - DRG01                    11   846.13  870.13
## - OU_LOS_hrs                1   979.94 1023.94
## 
## Step:  AIC=829.27
## Flipped ~ Age + PrimaryInsuranceCategory + OU_LOS_hrs + DRG01 + 
##     BloodPressureUpper + BloodPressureLower + Pulse + Temperature
## 
##                            Df Deviance     AIC
## - BloodPressureUpper        1   785.69  827.69
## - Pulse                     1   785.80  827.80
## - Temperature               1   786.57  828.57
## - BloodPressureLower        1   786.96  828.96
## <none>                          785.27  829.27
## + BloodPressureDiff         1   785.15  831.15
## + Respirations              1   785.20  831.20
## + Gender                    1   785.21  831.21
## + PulseOximetry             1   785.27  831.27
## - Age                       1   790.43  832.43
## - PrimaryInsuranceCategory  4   810.66  846.66
## - DRG01                    11   846.30  868.30
## - OU_LOS_hrs                1   980.05 1022.05
## 
## Step:  AIC=827.69
## Flipped ~ Age + PrimaryInsuranceCategory + OU_LOS_hrs + DRG01 + 
##     BloodPressureLower + Pulse + Temperature
## 
##                            Df Deviance     AIC
## - Pulse                     1   786.14  826.14
## - BloodPressureLower        1   786.97  826.97
## - Temperature               1   787.08  827.08
## <none>                          785.69  827.69
## + BloodPressureUpper        1   785.27  829.27
## + BloodPressureDiff         1   785.60  829.60
## + Respirations              1   785.66  829.66
## + Gender                    1   785.66  829.66
## + PulseOximetry             1   785.69  829.69
## - Age                       1   790.61  830.61
## - PrimaryInsuranceCategory  4   811.13  845.13
## - DRG01                    11   846.30  866.30
## - OU_LOS_hrs                1   980.17 1020.17
## 
## Step:  AIC=826.14
## Flipped ~ Age + PrimaryInsuranceCategory + OU_LOS_hrs + DRG01 + 
##     BloodPressureLower + Temperature
## 
##                            Df Deviance     AIC
## - BloodPressureLower        1   787.19  825.19
## - Temperature               1   787.82  825.82
## <none>                          786.14  826.14
## + Pulse                     1   785.69  827.69
## + BloodPressureUpper        1   785.80  827.80
## + BloodPressureDiff         1   786.02  828.02
## + Gender                    1   786.12  828.12
## + PulseOximetry             1   786.13  828.13
## + Respirations              1   786.13  828.13
## - Age                       1   791.41  829.41
## - PrimaryInsuranceCategory  4   811.67  843.67
## - DRG01                    11   847.12  865.12
## - OU_LOS_hrs                1   980.17 1018.17
## 
## Step:  AIC=825.19
## Flipped ~ Age + PrimaryInsuranceCategory + OU_LOS_hrs + DRG01 + 
##     Temperature
## 
##                            Df Deviance     AIC
## - Temperature               1   788.83  824.83
## <none>                          787.19  825.19
## + BloodPressureLower        1   786.14  826.14
## + Pulse                     1   786.97  826.97
## + BloodPressureDiff         1   787.08  827.08
## + Respirations              1   787.12  827.12
## + PulseOximetry             1   787.15  827.15
## + Gender                    1   787.18  827.18
## + BloodPressureUpper        1   787.19  827.19
## - Age                       1   792.38  828.38
## - PrimaryInsuranceCategory  4   812.39  842.39
## - DRG01                    11   849.12  865.12
## - OU_LOS_hrs                1   980.98 1016.98
## 
## Step:  AIC=824.83
## Flipped ~ Age + PrimaryInsuranceCategory + OU_LOS_hrs + DRG01
## 
##                            Df Deviance     AIC
## <none>                          788.83  824.83
## + Temperature               1   787.19  825.19
## + BloodPressureLower        1   787.82  825.82
## + Pulse                     1   788.39  826.39
## + BloodPressureDiff         1   788.74  826.74
## + Respirations              1   788.75  826.75
## + Gender                    1   788.78  826.78
## + PulseOximetry             1   788.78  826.78
## + BloodPressureUpper        1   788.83  826.83
## - Age                       1   793.95  827.95
## - PrimaryInsuranceCategory  4   814.02  842.02
## - DRG01                    11   850.36  864.36
## - OU_LOS_hrs                1   982.86 1016.86
summary(step_model)
## 
## Call:
## glm(formula = Flipped ~ Age + PrimaryInsuranceCategory + OU_LOS_hrs + 
##     DRG01, family = "binomial", data = train_data, control = glm.control(maxit = 500))
## 
## Coefficients:
##                                         Estimate Std. Error z value Pr(>|z|)
## (Intercept)                            -0.309862   0.519741  -0.596 0.551052
## Age                                    -0.015938   0.007087  -2.249 0.024519
## PrimaryInsuranceCategoryMEDICAID STATE  0.075414   0.465948   0.162 0.871423
## PrimaryInsuranceCategoryMEDICARE        0.980352   0.385009   2.546 0.010887
## PrimaryInsuranceCategoryMEDICARE OTHER -0.028396   0.395753  -0.072 0.942799
## PrimaryInsuranceCategoryPrivate         0.118560   0.360942   0.328 0.742553
## OU_LOS_hrs                              0.030243   0.002792  10.832  < 2e-16
## DRG01428                                0.276947   0.480179   0.577 0.564103
## DRG01486                               -0.127735   0.449622  -0.284 0.776338
## DRG01558                                0.504422   0.586525   0.860 0.389780
## DRG01577                                0.747287   0.767393   0.974 0.330156
## DRG01578                               -0.302138   0.520587  -0.580 0.561659
## DRG01599                               -0.390473   0.420089  -0.930 0.352629
## DRG01780                               -1.570000   0.322929  -4.862 1.16e-06
## DRG01782                               -0.650213   0.734101  -0.886 0.375764
## DRG01786                               -1.299789   0.362122  -3.589 0.000331
## DRG01787                               -0.929946   0.411921  -2.258 0.023972
## DRG01789                               -1.035670   0.360072  -2.876 0.004024
##                                           
## (Intercept)                               
## Age                                    *  
## PrimaryInsuranceCategoryMEDICAID STATE    
## PrimaryInsuranceCategoryMEDICARE       *  
## PrimaryInsuranceCategoryMEDICARE OTHER    
## PrimaryInsuranceCategoryPrivate           
## OU_LOS_hrs                             ***
## DRG01428                                  
## DRG01486                                  
## DRG01558                                  
## DRG01577                                  
## DRG01578                                  
## DRG01599                                  
## DRG01780                               ***
## DRG01782                                  
## DRG01786                               ***
## DRG01787                               *  
## DRG01789                               ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1072.36  on 776  degrees of freedom
## Residual deviance:  788.83  on 759  degrees of freedom
## AIC: 824.83
## 
## Number of Fisher Scoring iterations: 5
# Predict probabilities on test data
logistic_pred <- predict(step_model, test_data, type = "response")

# Find optimal cutoff for classification
cutoffs <- seq(0, 1, 0.1)
accuracy <- sapply(cutoffs, function(cut) {
  predicted_class <- ifelse(logistic_pred > cut, 1, 0)
  
  # Ensure both predicted and actual classes have the same factor levels
  predicted_class <- factor(predicted_class, levels = c(0, 1))
  actual_class <- factor(test_data$Flipped, levels = c(0, 1))
  
  # Calculate accuracy
  confusionMatrix(predicted_class, actual_class)$overall["Accuracy"]
})

# Print accuracy for each cutoff
data.frame(Cutoff = cutoffs, Accuracy = accuracy)

Plot accuracy vs cutoff

# Plot accuracy vs cutoff
plot(cutoffs, accuracy, type = "b", col = "blue", xlab = "Cutoff", ylab = "Accuracy",
     main = "Accuracy vs Cutoff for Logistic Regression")
abline(v = cutoffs[which.max(accuracy)], col = "red", lty = 2)
legend("topright", legend = paste("Best Cutoff =", cutoffs[which.max(accuracy)]), col = "red", lty = 2)

Confusion matrix at best cutoff

best_cutoff <- cutoffs[which.max(accuracy)]
logistic_class <- ifelse(logistic_pred > best_cutoff, 1, 0)
conf_matrix_logistic <- confusionMatrix(factor(logistic_class), factor(test_data$Flipped))
print(conf_matrix_logistic)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 151  38
##          1  29 116
##                                          
##                Accuracy : 0.7994         
##                  95% CI : (0.7524, 0.841)
##     No Information Rate : 0.5389         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.5946         
##                                          
##  Mcnemar's Test P-Value : 0.3284         
##                                          
##             Sensitivity : 0.8389         
##             Specificity : 0.7532         
##          Pos Pred Value : 0.7989         
##          Neg Pred Value : 0.8000         
##              Prevalence : 0.5389         
##          Detection Rate : 0.4521         
##    Detection Prevalence : 0.5659         
##       Balanced Accuracy : 0.7961         
##                                          
##        'Positive' Class : 0              
## 

ROC Curve for Logistic Regression

library(pROC)  # Ensure the pROC package is installed and loaded

# Calculate ROC curve
roc_logistic <- roc(test_data$Flipped, logistic_pred)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Plot ROC curve with enhancements
plot(roc_logistic, 
     main = "ROC Curve for Logistic Regression", 
     col = "#1c61b6",  # Custom color for the ROC curve
     lwd = 2,          # Thicker line for better visibility
     print.auc = TRUE, # Display AUC value on the plot
     auc.polygon = TRUE, # Fill the area under the curve
     auc.polygon.col = "#1c61b622", # Semi-transparent fill color
     print.thres = "best", # Highlight the optimal threshold
     print.thres.col = "red", # Color for the optimal threshold
     print.thres.adj = c(1.2, 1.2), # Adjust position of the threshold label
     grid = TRUE,      # Add a grid for better readability
     legacy.axes = TRUE) # Use 0-1 for the x-axis (False Positive Rate)

auc_logistic <- auc(roc_logistic)
cat("AUC for Logistic Regression:", auc_logistic, "\n")
## AUC for Logistic Regression: 0.8506854

Classification tree

# Load required libraries
library(rpart)       # For building the classification tree
library(rpart.plot)  # For visualizing the tree
library(caret)       # For confusion matrix

# Build the classification tree
default.tree <- rpart(Flipped ~ Age + PrimaryInsuranceCategory + OU_LOS_hrs + DRG01, 
                      data = train_data, method = "class")

# Predict on the test data
default.tree.pred <- predict(default.tree, test_data, type = "class")

# Visualize the tree
rpart.plot(default.tree, 
           type = 4,          # Display full tree structure
           extra = 104,       # Show probabilities and percentages
           box.palette = "Blues",  # Use a color palette
           shadow.col = "gray",    # Add shadows for depth
           nn = TRUE,         # Show node numbers
           fallen.leaves = TRUE,  # Align leaves at the bottom
           main = "Classification Tree")

# Confusion Matrix
cfm <- confusionMatrix(default.tree.pred, factor(test_data$Flipped))
print(cfm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 140  30
##          1  40 124
##                                           
##                Accuracy : 0.7904          
##                  95% CI : (0.7428, 0.8328)
##     No Information Rate : 0.5389          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.5803          
##                                           
##  Mcnemar's Test P-Value : 0.2821          
##                                           
##             Sensitivity : 0.7778          
##             Specificity : 0.8052          
##          Pos Pred Value : 0.8235          
##          Neg Pred Value : 0.7561          
##              Prevalence : 0.5389          
##          Detection Rate : 0.4192          
##    Detection Prevalence : 0.5090          
##       Balanced Accuracy : 0.7915          
##                                           
##        'Positive' Class : 0               
## 
# Plot confusion matrix with custom colors
plot(cfm$table, 
     col = c("green", "yellow"),  # Custom colors for the confusion matrix
     main = "Confusion Matrix", 
     xlab = "Actual", 
     ylab = "Predicted")

# Advanced Confusion Matrix Plot using ggplot2
cfm_table <- as.data.frame(cfm$table)
ggplot(cfm_table, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Freq), color = "black", size = 6) +
  scale_fill_gradient(low = "white", high = "navy") +
  labs(title = "Confusion Matrix", 
       x = "Actual", 
       y = "Predicted", 
       fill = "Frequency") +
  theme_minimal() +
  theme(axis.text = element_text(size = 12),
        axis.title = element_text(size = 14),
        plot.title = element_text(size = 16, face = "bold"))

# Calculate additional performance metrics
precision <- cfm$byClass["Pos Pred Value"]
recall <- cfm$byClass["Sensitivity"]
f1_score <- cfm$byClass["F1"]
cat("Precision:", precision, "\n")
## Precision: 0.8235294
cat("Recall:", recall, "\n")
## Recall: 0.7777778
cat("F1-Score:", f1_score, "\n")
## F1-Score: 0.8
# ROC curve
library(ROCR)
predict.tree.prob = predict(default.tree, newdata = test_data, type = "prob")

# plot ROC and compute AUC
library(pROC)
ROC.tree = roc(response = test_data$Flipped, predictor = predict.tree.prob[,2])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(ROC.tree, print.thres = c(0.5))

auc = ROC.tree$auc
auc
## Area under the curve: 0.816

Random Forest

library(randomForest)
library(caret)
library(ggplot2)

# Ensure 'Flipped' is a factor for classification
train_data$Flipped <- as.factor(train_data$Flipped)
test_data$Flipped <- as.factor(test_data$Flipped)

# Tune 'mtry' parameter for optimal model performance
best_mtry <- tuneRF(train_data[, c("Age", "PrimaryInsuranceCategory", "OU_LOS_hrs", "DRG01")], 
                    train_data$Flipped, 
                    stepFactor = 1.5, 
                    improve = 0.01, 
                    ntreeTry = 100, 
                    trace = FALSE, 
                    plot = FALSE)
## -0.02369668 0.01
optimal_mtry <- best_mtry[which.min(best_mtry[, 2]), 1]  # Extract best 'mtry' value

# Train Random Forest Model with Optimal Parameters
rf <- randomForest(Flipped ~ Age + PrimaryInsuranceCategory + OU_LOS_hrs + DRG01, 
                   data = train_data, 
                   ntree = 500, 
                   mtry = optimal_mtry, 
                   nodesize = 5, 
                   importance = TRUE)

# Improved Variable Importance Plot
varImpPlot(rf, type = 1, main = "Feature Importance", col = "blue")

# Predictions on Test Set
rf.pred <- predict(rf, test_data)

# Enhanced Confusion Matrix Output
conf_matrix <- confusionMatrix(factor(rf.pred), factor(test_data$Flipped))
print(conf_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 144  35
##          1  36 119
##                                           
##                Accuracy : 0.7874          
##                  95% CI : (0.7396, 0.8301)
##     No Information Rate : 0.5389          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.5725          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.8000          
##             Specificity : 0.7727          
##          Pos Pred Value : 0.8045          
##          Neg Pred Value : 0.7677          
##              Prevalence : 0.5389          
##          Detection Rate : 0.4311          
##    Detection Prevalence : 0.5359          
##       Balanced Accuracy : 0.7864          
##                                           
##        'Positive' Class : 0               
## 
# Visualize Confusion Matrix with ggplot
conf_matrix_df <- as.data.frame(conf_matrix$table)
ggplot(conf_matrix_df, aes(x = Reference, y = Prediction, fill = Freq)) + 
  geom_tile(color = "white") + 
  geom_text(aes(label = Freq), vjust = 1, size = 5) + 
  scale_fill_gradient(low = "white", high = "blue") + 
  labs(title = "Confusion Matrix - Random Forest", x = "Actual", y = "Predicted") +
  theme_minimal()

## Naive Bayes

library(caret)
library(e1071)

# Running Naive Bayes
naive <- naiveBayes(Flipped ~., data = train_data)

# Predicting probabilities 
pred.naive.prob <- predict(naive, newdata = test_data, type = "raw")

# Predicting class membership
pred.naive.class <- predict(naive, newdata = test_data)

Confusion matrix test_data

#Confusion matrix
confusionMatrix(factor(pred.naive.class), factor(test_data$Flipped))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 179   1
##          1   1 153
##                                           
##                Accuracy : 0.994           
##                  95% CI : (0.9785, 0.9993)
##     No Information Rate : 0.5389          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.988           
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9944          
##             Specificity : 0.9935          
##          Pos Pred Value : 0.9944          
##          Neg Pred Value : 0.9935          
##              Prevalence : 0.5389          
##          Detection Rate : 0.5359          
##    Detection Prevalence : 0.5389          
##       Balanced Accuracy : 0.9940          
##                                           
##        'Positive' Class : 0               
## 

CLASSIFICATION TREE TEST RESULTS to Excel

# library(openxlsx)
# write.xlsx(cbind(test_data, predict.tree.prob) , file = "test_result.xlsx")