Loading Packages

library("data.table")
## Warning: package 'data.table' was built under R version 4.2.3
library("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library("tidyverse")
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
## Warning: package 'forcats' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.2
## ✔ ggplot2   3.4.1     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.1.8
## ✔ purrr     0.3.4     ✔ tidyr     1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between()     masks data.table::between()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ dplyr::first()       masks data.table::first()
## ✖ lubridate::hour()    masks data.table::hour()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ dplyr::last()        masks data.table::last()
## ✖ lubridate::mday()    masks data.table::mday()
## ✖ lubridate::minute()  masks data.table::minute()
## ✖ lubridate::month()   masks data.table::month()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ lubridate::second()  masks data.table::second()
## ✖ purrr::transpose()   masks data.table::transpose()
## ✖ lubridate::wday()    masks data.table::wday()
## ✖ lubridate::week()    masks data.table::week()
## ✖ lubridate::yday()    masks data.table::yday()
## ✖ lubridate::year()    masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("maps")
## 
## Attaching package: 'maps'
## 
## The following object is masked from 'package:purrr':
## 
##     map
library("ggplot2")
library("nnet")
## Warning: package 'nnet' was built under R version 4.2.3
library("rpart")
library("RColorBrewer")
#library("fastDummies")

This report details the process of loading, cleaning, and analyzing data from three datasets: Hospital general information, Census data, and Medicare charges. The goal is to explore the distribution of average total payment amounts for Acute Myocardial Infarction (AMI) with MS-DRGs 280-282.

Loading data: The datasets are loaded using the fread() function from the data.table package. Three data frames are created: HospitalRating, CensusData, and MedicareCharges.

HospitalRating<-fread("Hospital general information.csv")
CensusData<-fread("Census.csv", header = T)
MedicareCharges<-fread("Medicare.csv")

Variable selection and data cleaning: Relevant variables are selected from the hospital rating and census data, and the census variables are renamed for readability.

#Selecting Variables
HospitalRating_filtered<- HospitalRating %>% select(`Hospital Type`, `Hospital Ownership`, `Hospital overall rating`, `Provider ID`)
# Selecting Variables and renaming them
CensusData_filtered <- CensusData %>% 
  select(NAME, educated_employed = B16010_042E, educated_not_employed = B16010_048E, 
         speak_english = B16010_043E, non_english = B16010_053E)

Joining datasets

#Joining Data by Provider ID for Medical Charges data and the Hospital Rating Data
joined_data <- inner_join(MedicareCharges, HospitalRating_filtered, by = c("Rndrng_Prvdr_CCN" = "Provider ID" ))

# Filter and separate the NAME column to have a separate ZIP code for combining
CensusData_filtered1 <- data.frame(CensusData_filtered %>% mutate(NAME = str_trim(NAME)) %>%
  separate(NAME, into = c("name1", "name2"), sep = " ", remove = FALSE))
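# Illustration (a sketch, not part of the pipeline): assuming census NAME
# entries look like "ZCTA5 35004", separate() splits at the first space, so
# name2 carries the 5-digit ZIP/ZCTA code used for the join below:
# data.frame(NAME = "ZCTA5 35004") %>%
#   separate(NAME, into = c("name1", "name2"), sep = " ")
# #>   name1 name2
# #> 1 ZCTA5 35004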
#Converting the ZIP code variable (Rndrng_Prvdr_Zip5) to Character
joined_data$Rndrng_Prvdr_Zip5<- as.character(joined_data$Rndrng_Prvdr_Zip5)


#Joining the data sets, successfully joining the three data sets by specific variables (Here, by ZIP Code)
FullData <- inner_join(CensusData_filtered1, joined_data, by = c("name2" = "Rndrng_Prvdr_Zip5" ))

#Removing some variables from the data set and Renaming the Zip Code Variable for easy reference
FullData <- FullData %>%
  select(-Rndrng_Prvdr_City, -Rndrng_Prvdr_St, -Rndrng_Prvdr_Org_Name, -name1, -NAME,
         -Rndrng_Prvdr_State_FIPS, -Rndrng_Prvdr_RUCA_Desc, -Rndrng_Prvdr_RUCA,
         -Avg_Submtd_Cvrd_Chrg, -Avg_Mdcr_Pymt_Amt, -Rndrng_Prvdr_CCN) %>%
  rename(ZIP_Code = name2)


# Create a data frame with state abbreviations and names
state_df <- data.frame(state.abb, state.name)
colnames(state_df) <- c("state_abbrv", "state_name")

# Merge the state names with your dataset using the state abbreviation
FullData <- merge(FullData, state_df, by.x = "Rndrng_Prvdr_State_Abrvtn", by.y = "state_abbrv", all.x = TRUE)

Analysing Acute Myocardial Infarction (AMI) data: Creating a histogram that shows the distribution of average total payment amounts for AMI (MS-DRGs 280-282) across states.

#Filtering the Data for AMI MS-DRGs 280-282
FullData_AMI <- FullData %>% filter(DRG_Cd >= 280 & DRG_Cd <= 282)

library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

p <- ggplot(FullData_AMI, aes(x = Avg_Tot_Pymt_Amt, fill = state_name)) +
  geom_histogram(bins = 50, col = "white") +
  labs(title = "Distribution of Avg_Tot_Pymt_Amt for AMI MS-DRGs 280-282",
       x = "Average Total Payment Amount",
       y = "Count") +
  theme(plot.title = element_text(hjust = 0.5),
        plot.background = element_rect(fill = "white"),
        panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(colour = "gray", linewidth = 0.5))
p

Creating a Density map for the US showing various costs per State: This code chunk filters the data for AMI MS-DRGs 280-282 and creates a density plot to visualize the distribution of average total payment amounts for each state.

# Filtering the Data for AMI MS-DRGs 280-282
FullData_AMI <- FullData %>% filter(DRG_Cd >= 280 & DRG_Cd <= 282)

# Density plot
p <- ggplot(FullData_AMI, aes(x = Avg_Tot_Pymt_Amt, y = after_stat(density), fill = state_name)) +
  geom_density(alpha = 0.5) +
  labs(title = "Density Plot of Avg_Tot_Pymt_Amt for AMI MS-DRGs 280-282",
       x = "Average Total Payment Amount",
       y = "Density") +
  theme(plot.title = element_text(hjust = 0.5))
ggplotly(p)

Creating an interactive heatmap to show Avg_Tot_Pymt_Amt per state: This code chunk calculates the mean average total payment amount for AMI per state and creates an interactive heatmap to visualize these values.

state_data <- FullData_AMI %>% 
  group_by(state_name) %>% 
  summarize(avg_amt = mean(Avg_Tot_Pymt_Amt))

# Create the ggplot object
heatmap_plot <- ggplot(state_data, aes(x = state_name, y = 1, fill = avg_amt)) +
  geom_tile() +
  # Center the diverging scale on the overall mean (payments are all positive,
  # so midpoint = 0 would push every state into the upper half of the gradient)
  scale_fill_gradient2(low = "#F7FBFF", mid = "#6BAED6", high = "#08306B",
                       midpoint = mean(state_data$avg_amt)) +
  ggtitle("Average Total Payment Amount for Acute Myocardial Infarction by State") +
  xlab("") +
  ylab("") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, size = 10),
        axis.ticks.x = element_blank(),
        axis.line = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank())

# Convert the ggplot object to a plotly object and display it
interactive_heatmap <- ggplotly(heatmap_plot)
interactive_heatmap

Histogram Plot of Avg_Tot_Pymt_Amt for AMI MS-DRGs 280-282: This code chunk creates a histogram to visualize the distribution of average total payment amounts for AMI MS-DRGs 280-282.

FullData_AMI <- FullData %>% filter(DRG_Cd >= 280 & DRG_Cd <= 282)
p <- ggplot(FullData_AMI, aes(x = Avg_Tot_Pymt_Amt, fill = state_name)) +
  geom_histogram(bins = 100, col = "white") +
  labs(title = "Avg_Tot_Pymt_Amt for AMI MS-DRGs 280-282",
       x = "Average Total Payment Amount",
       y = "Count") +
  theme(plot.title = element_text(hjust = 0.5, size = 10),
        plot.background = element_rect(fill = "white"),
        panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(colour = "gray", linewidth = 0.9),
        axis.title = element_text(size = 16),
        axis.text = element_text(size = 14),
        legend.title = element_text(size = 14),
        legend.text = element_text(size = 12))

ggplotly(p)

Data wrangling and variable selection: This code chunk selects relevant variables for further analysis, converts character variables to numeric, and removes variables with only one level.

# Select relevant variables for the analysis
selected_data <- FullData_AMI %>%
  select(Avg_Tot_Pymt_Amt, `Hospital Type`, `Hospital Ownership`, `Hospital overall rating`, educated_employed, educated_not_employed, speak_english, non_english,DRG_Cd)

# Convert character variables to numeric
selected_data$educated_employed <- as.numeric(selected_data$educated_employed)
selected_data$educated_not_employed <- as.numeric(selected_data$educated_not_employed)
selected_data$speak_english <- as.numeric(selected_data$speak_english)
selected_data$non_english <- as.numeric(selected_data$non_english)
# Identify factor variables with only one level
single_level_factors <- sapply(selected_data, function(x) is.factor(x) && length(levels(x)) == 1)

# Remove those variables from the data
selected_data <- selected_data[!single_level_factors]
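# Note: columns read by fread() arrive as character rather than factor, so the
# check above only catches true factor columns; to also drop constant character
# columns, one could test length(unique(x)) == 1 instead of levels().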

Splitting data into training and test sets: The data is split into a training set (70%) and a testing set (30%) to evaluate the performance of the linear regression and regression tree models, using the observations with DRG_Cd 280, 281, and 282.

AMI_data <- selected_data %>% 
  filter(DRG_Cd %in% c('280', '281','282'))
library(rsample)
## Warning: package 'rsample' was built under R version 4.2.3
# splitting the data into training and testing sets (AMI)
set.seed(1526)
AMI_split <- initial_split(AMI_data, prop = 0.70)
train_data <- training(AMI_split)
test_data  <- testing(AMI_split)

Creating the Linear Regression Model: This code chunk fits a linear regression model to the selected data and calculates the adjusted R2 and root mean-square error (RMSE) to evaluate the model's performance. The predicted values are plotted against the actual values to visualize the model's accuracy. Note that these metrics are computed on the same data used to fit the model, unlike the test-set metrics reported for the tree and ANN models below.

# Select relevant variables
selected_data <- FullData_AMI %>%
  select('Rndrng_Prvdr_State_Abrvtn','DRG_Cd','Avg_Tot_Pymt_Amt', 'Hospital Type', 'Hospital Ownership', 'Hospital overall rating',
         'educated_employed','educated_not_employed','Tot_Dschrgs')

# Convert categorical variables to factors
selected_data$'Hospital Type' <- as.factor(selected_data$'Hospital Type')
selected_data$'Hospital Ownership' <- as.factor(selected_data$'Hospital Ownership')

# Remove factor variables with less than 2 levels
selected_data <- selected_data %>%
  select_if(~ !is.factor(.) || nlevels(.) >= 2)


# Fit the linear regression model
lin_reg_model <- lm(Avg_Tot_Pymt_Amt ~ ., data = selected_data)

# Calculate adjusted R2 and root mean-square error
lin_reg_adj_r2 <- summary(lin_reg_model)$adj.r.squared
lin_reg_rmse <- sqrt(mean(lin_reg_model$residuals^2))
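# For reference, the textbook formulas behind these metrics:
#   RMSE = sqrt(mean((y - y_hat)^2))
#   adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1),
# where n is the number of observations and p the number of predictors.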

cat("Adjusted R2 for Linear Regression Model:", lin_reg_adj_r2, "\n")
## Adjusted R2 for Linear Regression Model: 0.8382889
cat("Root Mean-Square Error for Linear Regression Model:", lin_reg_rmse, "\n")
## Root Mean-Square Error for Linear Regression Model: 1097.179
# Predict the values using the linear regression model
predictions_lin_reg <- predict(lin_reg_model, selected_data)
## Warning in predict.lm(lin_reg_model, selected_data): prediction from a
## rank-deficient fit may be misleading
# Plot the predicted values against the actual values
plot(selected_data$Avg_Tot_Pymt_Amt, predictions_lin_reg, 
     xlab = "Actual Values", ylab = "Predicted Values",
     main = "Linear Regression Model: Actual vs. Predicted Values")
abline(0, 1, col = "red", lwd = 2)

# Add the adjusted R2 and RMSE to the plot
text(x = min(selected_data$Avg_Tot_Pymt_Amt), y = max(predictions_lin_reg),
     labels = paste("Adjusted R2 (Linear Regression):", round(lin_reg_adj_r2, 4), "\n",
                    "Root Mean-Square Error (Linear Regression):", round(lin_reg_rmse, 4)),
     pos = 4)

Creating the Regression Trees (CART) model: This code chunk fits a regression tree model to the training data and predicts the test data. The R2, adjusted R2, and RMSE are calculated to evaluate the model's performance. The predicted values are plotted against the actual values to visualize the model's accuracy.

library(rpart)

# Fit the regression tree model
tree <- rpart(Avg_Tot_Pymt_Amt ~ ., data = train_data)

# Predict the test data
predictions_tree <- predict(tree, test_data)

# Calculate the R2, adjusted R2, and root mean-square error
lm_tree <- lm(test_data$Avg_Tot_Pymt_Amt ~ predictions_tree)
r2_tree <- summary(lm_tree)$r.squared
adj_r2_tree <- summary(lm_tree)$adj.r.squared

# Calculate the root mean-square error
rmse_tree <- sqrt(mean((test_data$Avg_Tot_Pymt_Amt - predictions_tree)^2))

cat("R2 (Regression Tree):", r2_tree, "\n")
## R2 (Regression Tree): 0.5365508
cat("Adjusted R2 (Regression Tree):", adj_r2_tree, "\n")
## Adjusted R2 (Regression Tree): 0.5361152
cat("Root Mean-Square Error (Regression Tree):", rmse_tree, "\n")
## Root Mean-Square Error (Regression Tree): 2535.656
# Plot the predicted values against the actual values
plot(test_data$Avg_Tot_Pymt_Amt, predictions_tree, 
     xlab = "Actual Values", ylab = "Predicted Values",
     main = "Regression Tree Model: Actual vs. Predicted Values")
abline(0, 1, col = "red", lwd = 2)

# Add the R2, adjusted R2, and RMSE to the plot
text(x = min(test_data$Avg_Tot_Pymt_Amt), y = max(predictions_tree),
     labels = paste("R2 (Regression Tree):", round(r2_tree, 4), "\n",
                    "Adjusted R2 (Regression Tree):", round(adj_r2_tree, 4), "\n",
                    "Root Mean-Square Error (Regression Tree):", round(rmse_tree, 4)),
     pos = 4)

Create the Feedforward ANN model with two hidden layers of 5 neurons each

# Install and load the neuralnet package
if (!require(neuralnet)) {
  install.packages("neuralnet")
  library(neuralnet)
}
## Loading required package: neuralnet
## Warning: package 'neuralnet' was built under R version 4.2.3
## 
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
## 
##     compute
# Identify non-numeric variables
non_numeric_vars <- sapply(train_data, function(x) !is.numeric(x))

# Convert non-numeric variables to numeric variables if possible
train_data[, non_numeric_vars] <- lapply(train_data[, non_numeric_vars], function(x) if (!any(is.na(as.numeric(x)))) as.numeric(x) else x)
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion

## Warning in FUN(X[[i]], ...): NAs introduced by coercion
test_data[, non_numeric_vars] <- lapply(test_data[, non_numeric_vars], function(x) if (!any(is.na(as.numeric(x)))) as.numeric(x) else x)
## Warning in FUN(X[[i]], ...): NAs introduced by coercion

## Warning in FUN(X[[i]], ...): NAs introduced by coercion

## Warning in FUN(X[[i]], ...): NAs introduced by coercion
# Remove remaining non-numeric variables
train_data <- train_data[, sapply(train_data, is.numeric)]
test_data <- test_data[, sapply(test_data, is.numeric)]

# Normalize the data
normalize <- function(x) {
  return ((x - min(x)) / (max(x) - min(x)))
}

train_data_norm <- as.data.frame(lapply(train_data, normalize))
test_data_norm <- as.data.frame(lapply(test_data, normalize))
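# Note on scaling: normalize() maps each column to [0, 1] via (x - min)/(max - min).
# Here the test set is scaled with its own min/max; scaling it with the training
# min/max is the more common convention and avoids information leakage.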

# Fit the ANN model with 2 hidden layers
set.seed(123)
ann <- neuralnet(Avg_Tot_Pymt_Amt ~ ., data = train_data_norm, hidden = c(5, 5), linear.output = TRUE)

# Predict the test data
predictions_ann <- predict(ann, test_data_norm)

# Re-scale the predictions
predictions_ann <- predictions_ann * (max(test_data$Avg_Tot_Pymt_Amt) - min(test_data$Avg_Tot_Pymt_Amt)) + min(test_data$Avg_Tot_Pymt_Amt)

# Calculate the adjusted R2 and root mean-square error
lm_ann <- lm(test_data$Avg_Tot_Pymt_Amt ~ predictions_ann)
adj_r2_ann <- summary(lm_ann)$adj.r.squared
rmse_ann <- sqrt(mean((test_data$Avg_Tot_Pymt_Amt - predictions_ann)^2))

cat("Adjusted R2 (ANN):", adj_r2_ann, "\n")
## Adjusted R2 (ANN): 0.442596
cat("Root Mean-Square Error (ANN):", rmse_ann, "\n")
## Root Mean-Square Error (ANN): 3472.15
# Plot the predicted values against the actual values
plot(test_data$Avg_Tot_Pymt_Amt, predictions_ann, 
     xlab = "Actual Values", ylab = "Predicted Values",
     main = "Artificial Neural Network Model: Actual vs. Predicted Values")
abline(0, 1, col = "red", lwd = 2)

# Add the adjusted R2 and RMSE to the plot
text(x = min(test_data$Avg_Tot_Pymt_Amt), y = max(predictions_ann),
     labels = paste("Adjusted R2 (ANN):", round(adj_r2_ann, 4), "\n",
                    "Root Mean-Square Error (ANN):", round(rmse_ann, 4)),
     pos = 4)

Comparing the performance of each model: Linear regression had the highest adjusted R2 and the lowest RMSE, so it performed best at predicting the average cost for AMI. Keep in mind that the linear regression metrics are in-sample, while the tree and ANN metrics come from the held-out test set, so this comparison favors the linear model.

# Compare performance of the three models
performance <- data.frame(
  Model = c("Linear Regression", "Regression Tree", "Artificial Neural Network"),
  Adjusted_R2 = c(lin_reg_adj_r2, adj_r2_tree, adj_r2_ann),
  RMSE = c(lin_reg_rmse, rmse_tree, rmse_ann)
)

performance <- performance %>%
  arrange(desc(Adjusted_R2), RMSE) %>%
  mutate(Rank = row_number())

cat("Comparison of Model Performance:\n")
## Comparison of Model Performance:
print(performance)
##                       Model Adjusted_R2     RMSE Rank
## 1         Linear Regression   0.8382889 1097.179    1
## 2           Regression Tree   0.5361152 2535.656    2
## 3 Artificial Neural Network   0.4425960 3472.150    3

Comparing Model Performance for Regression Tree, ANN, and Linear Regression

# Plot the performance comparison
library(ggplot2)

# Adjusted R2 comparison
ggplot(performance, aes(x = Model, y = Adjusted_R2, fill = Model)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(title = "Adjusted R2 Comparison",
       x = "Model",
       y = "Adjusted R2") +
  theme(legend.position = "none")

# RMSE comparison
ggplot(performance, aes(x = Model, y = RMSE, fill = Model)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(title = "Root Mean-Square Error Comparison",
       x = "Model",
       y = "RMSE") +
  theme(legend.position = "none")

Based on the adjusted R-squared and RMSE values obtained for the three models, the Linear Regression model performed best in predicting the average cost of healthcare for Acute Myocardial Infarction (AMI). It had the highest adjusted R-squared (0.8382889) and the lowest RMSE (1097.179), indicating that it explains more of the variance in the data and predicts the average cost more accurately than the other two models.

The Regression Tree model ranked second, with an adjusted R-squared of 0.5361152 and an RMSE of 2535.656. Although it performed better than the Artificial Neural Network, it was not as accurate as the Linear Regression model. The Artificial Neural Network had the lowest performance of the three, with an adjusted R-squared of 0.4425960 and an RMSE of 3472.150, suggesting that the ANN may not be the best choice for predicting the average cost of healthcare for AMI in this case.

In conclusion, the Linear Regression model is the most suitable of the three for predicting the average cost of healthcare for AMI patients in the Medicare system. The model takes into account factors such as hospital rating, employment, and English language proficiency. Healthcare providers and policymakers can use such a model to better understand the factors influencing healthcare costs and to make more informed decisions about resource allocation and cost management.
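As an illustration of that last point, a minimal sketch of scoring a new hospital with the fitted model (the new_hospital object and its edited value are hypothetical):

# Hypothetical example: use one observed row as a template and edit it
new_hospital <- selected_data[1, ]
new_hospital$Tot_Dschrgs <- 50   # hypothetical discharge volume
predict(lin_reg_model, newdata = new_hospital)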

Surgical hip/femur fracture treatment (SHFFT)

#Filtering the Data for Surgical hip/femur fracture treatment (SHFFT)
FullData_SHFFT <- FullData %>% filter(DRG_Cd >= 480 & DRG_Cd <= 482)

# Calculate average cost per state (summarized into a separate object,
# avg_cost_by_state, so the filtered SHFFT data keeps its original rows)
avg_cost_by_state <- FullData_SHFFT %>% 
  group_by(Rndrng_Prvdr_State_Abrvtn) %>% 
  summarize(avg_cost = mean(Avg_Tot_Pymt_Amt, na.rm = TRUE))

Creating a histogram showing Avg_Tot_Pymt_Amt for Surgical hip/femur fracture treatment (SHFFT)

library(plotly)


# Create histogram of average total payment amount for SHFFT
histogram <- ggplot(FullData_SHFFT, aes(x = Avg_Tot_Pymt_Amt, fill = Rndrng_Prvdr_State_Abrvtn)) +
  geom_histogram(binwidth = 5000, color = "black") +
  labs(title = "Distribution of Average Total Payment Amount for Surgical Hip/Femur Fracture Treatment",
       x = "Average Total Payment Amount",
       y = "Count",
       fill = "State") +
  scale_fill_discrete(na.translate = FALSE) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 10),
        axis.title = element_text(face = "bold", size = 14),
        legend.title = element_text(face = "bold", size = 14),
        legend.position = "right",
        legend.box.spacing = unit(0.2, "cm"),
        legend.margin = margin(t = 0, b = 0, l = 0, r = 20))

# Make the histogram interactive using plotly
ggplotly(histogram, tooltip = c("fill", "x", "y")) %>%
  layout(title = list(text = "Distribution of Average Total Payment Amount for Surgical Hip/Femur Fracture Treatment",
                       x = 0.5),
         xaxis = list(title = "Average Total Payment Amount"),
         yaxis = list(title = "Count"))

Data wrangling and variable selection

library(dplyr)

# Convert character columns to numeric
FullData_SHFFT$educated_employed <- as.numeric(FullData_SHFFT$educated_employed)
FullData_SHFFT$educated_not_employed <- as.numeric(FullData_SHFFT$educated_not_employed)
FullData_SHFFT$speak_english <- as.numeric(FullData_SHFFT$speak_english)
FullData_SHFFT$non_english <- as.numeric(FullData_SHFFT$non_english)
FullData_SHFFT$`Hospital overall rating` <- as.numeric(FullData_SHFFT$`Hospital overall rating`)
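# Note: rating values stored as text (e.g. "Not Available") are coerced to NA
# here, which is why some observations are later dropped due to missingness.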

# Select relevant variables
selected_data_SHIFT <- FullData_SHFFT %>% select(Avg_Tot_Pymt_Amt, educated_employed, educated_not_employed, speak_english, non_english, Tot_Dschrgs, `Hospital Type`, `Hospital overall rating`, DRG_Cd,state_name)

Splitting data into training (70%) and test (30%) sets for model building

SHIFT_Final<-selected_data_SHIFT %>% 
  filter(DRG_Cd %in% c('480', '481', '482'))
library(rsample)
# splitting the data into training and testing sets (AMI)
set.seed(1526)
SHIFT_split <- initial_split(SHIFT_Final, prop = 0.70)
train_data_SHIFT <- training(SHIFT_split)
test_data_SHIFT  <- testing(SHIFT_split)

Decision Tree model: Used to predict Avg_Tot_Pymt_Amt for surgical hip/femur fracture treatment (SHFFT)

library(rpart.plot)
library(rpart)
# Remove rows with missing values before fitting and testing
train_data_SHIFT_fixed <- na.omit(train_data_SHIFT)
test_data_SHIFT_fixed  <- na.omit(test_data_SHIFT)

# Decision Tree
tree_model <- rpart(Avg_Tot_Pymt_Amt ~ ., data = train_data_SHIFT_fixed)

# Compute R-squared value for the Decision Tree model on the test set
tree_pred <- predict(tree_model, test_data_SHIFT_fixed)
tree_r2 <- 1 - (sum((test_data_SHIFT_fixed$Avg_Tot_Pymt_Amt - tree_pred)^2) /
                  sum((test_data_SHIFT_fixed$Avg_Tot_Pymt_Amt - mean(test_data_SHIFT_fixed$Avg_Tot_Pymt_Amt))^2))

# Plot the tree
rpart.plot(tree_model, main = "Decision Tree", tweak = 2.5)

Linear regression model: Data wrangling and variable selection before building the model

# Convert Hospital Type to a factor
train_data_SHIFT$`Hospital Type` <- as.factor(train_data_SHIFT$`Hospital Type`)

# Select variables for the models
selected_vars <- c("Avg_Tot_Pymt_Amt", "educated_employed", "educated_not_employed", "speak_english", "non_english", "Tot_Dschrgs", "Hospital overall rating")
train_data_selected <- train_data_SHIFT[selected_vars]

#Linear regression model
lm_model <- lm(Avg_Tot_Pymt_Amt ~ ., data = train_data_selected)
summary(lm_model)
## 
## Call:
## lm(formula = Avg_Tot_Pymt_Amt ~ ., data = train_data_selected)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -13303  -5111   -113   3322  38051 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               16381.1739   638.9835  25.636  < 2e-16 ***
## educated_employed             1.2929     0.1750   7.387 3.00e-13 ***
## educated_not_employed        -1.2210     0.2109  -5.789 9.29e-09 ***
## speak_english                -0.9792     0.2067  -4.737 2.46e-06 ***
## non_english                   3.8679     4.7297   0.818    0.414    
## Tot_Dschrgs                  78.7023    18.7932   4.188 3.05e-05 ***
## `Hospital overall rating`  -348.2894   156.2554  -2.229    0.026 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5979 on 1078 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.09884,    Adjusted R-squared:  0.09383 
## F-statistic: 19.71 on 6 and 1078 DF,  p-value: < 2.2e-16

Regression Trees (CART)

library(rpart)

cart_model <- rpart(Avg_Tot_Pymt_Amt ~ ., data = train_data_selected)
printcp(cart_model)
## 
## Regression tree:
## rpart(formula = Avg_Tot_Pymt_Amt ~ ., data = train_data_selected)
## 
## Variables actually used in tree construction:
## [1] educated_employed     educated_not_employed non_english          
## [4] speak_english         Tot_Dschrgs          
## 
## Root node error: 4.2837e+10/1087 = 39408232
## 
## n= 1087 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.046868      0   1.00000 1.00176 0.076569
## 2 0.017435      1   0.95313 0.99206 0.076311
## 3 0.010455      2   0.93570 1.02179 0.069931
## 4 0.010331      5   0.90196 1.02050 0.070820
## 5 0.010000      8   0.87097 1.02100 0.070848
rpart.plot(cart_model, extra = 1)

Feedforward ANN: The model reported convergence. Convergence alone does not establish that the predictions are accurate, and an initial and final error of exactly zero (see the output below) is a warning sign rather than evidence of a good fit; the note after the model output explains the likely cause.

library(nnet)

# Normalize the data
normalize <- function(x) {
  return ((x - min(x)) / (max(x) - min(x)))
}

train_data_normalized <- as.data.frame(lapply(train_data_selected, normalize))

# Create the ANN model with a single hidden layer of 5 units (nnet fits one hidden layer)
ann_model <- nnet(Avg_Tot_Pymt_Amt ~ ., data = train_data_normalized, size = 5, linout = TRUE)
## # weights:  41
## initial  value 0.000000 
## final  value 0.000000 
## converged
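# Added note: an initial and final error of exactly 0 suggests the network was
# effectively fit on no usable rows. normalize() returns an all-NA column when
# its input contains any NA (min()/max() propagate NA), and nnet() then drops
# those rows. A likely fix is to drop incomplete rows before scaling, e.g.:
# train_data_normalized <- as.data.frame(lapply(na.omit(train_data_selected), normalize))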
# Plot the ANN model
library(NeuralNetTools)
## Warning: package 'NeuralNetTools' was built under R version 4.2.3
par(mar = c(1, 0, 0, 0) + 1)
par(cex = 0.6)
plotnet(ann_model)

Recalculate the adjusted R2 and RMSE for the linear regression and decision tree models, and the RMSE for the ANN model: On the test set, using the variables Avg_Tot_Pymt_Amt, educated_employed, educated_not_employed, speak_english, non_english, Tot_Dschrgs, Hospital Type, Hospital overall rating, DRG_Cd, and state_name, the ANN model was reported to perform best, having the highest R2 value; given the training issue noted above, that result should be verified against the metrics below.

#Decision Tree Model Performance
cart_pred <- predict(cart_model, test_data_SHIFT)
# (R-squared for the tree is computed below with the r_squared() helper)
cart_rmse <- sqrt(mean((test_data_SHIFT$Avg_Tot_Pymt_Amt - cart_pred)^2))
# Linear Regression Model Performance
lm_pred <- predict(lm_model, test_data_SHIFT)
lm_r2 <- summary(lm_model)$adj.r.squared
# Rows with missing predictors yield NA predictions, which makes this RMSE NA
lm_rmse <- sqrt(mean((test_data_SHIFT$Avg_Tot_Pymt_Amt - lm_pred)^2))

cat("Linear Regression Model:\n")
## Linear Regression Model:
cat("Adjusted R2:", lm_r2, "\n")
## Adjusted R2: 0.09382535
cat("RMSE:", lm_rmse, "\n\n")
## RMSE: NA
# R-squared function
r_squared <- function(true_values, predicted_values) {
  ss_res <- sum((true_values - predicted_values) ^ 2)
  ss_tot <- sum((true_values - mean(true_values)) ^ 2)
  r2 <- 1 - (ss_res / ss_tot)
  return(r2)
}

# Decision Tree Model R-squared
cart_r2 <- r_squared(test_data_SHIFT$Avg_Tot_Pymt_Amt, cart_pred)

cat("Decision Tree Model:\n")
## Decision Tree Model:
cat("R-squared:", cart_r2, "\n")
## R-squared: 0.04651877
cat("RMSE:", cart_rmse, "\n\n")
## RMSE: 5858.518
test_data_ann <- test_data_SHIFT
test_data_ann$Avg_Tot_Pymt_Amt <- normalize(test_data_ann$Avg_Tot_Pymt_Amt)
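To complete the evaluation, a minimal sketch of the remaining steps, mirroring the AMI section (the names test_ann, rng, pred_ann, ann_r2, and ann_rmse are introduced for illustration; the model should first be refit on complete cases as noted above):

# Normalize the complete-case test rows, predict, rescale, and score
test_ann <- na.omit(as.data.frame(test_data_SHIFT)[selected_vars])
test_ann_norm <- as.data.frame(lapply(test_ann, normalize))
pred_norm <- predict(ann_model, test_ann_norm)
rng <- range(test_ann$Avg_Tot_Pymt_Amt)
pred_ann <- pred_norm * diff(rng) + rng[1]   # back to the original dollar scale
ann_r2 <- r_squared(test_ann$Avg_Tot_Pymt_Amt, pred_ann)
ann_rmse <- sqrt(mean((test_ann$Avg_Tot_Pymt_Amt - pred_ann)^2))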