library("data.table")
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("tidyverse")
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'forcats' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.1 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.1.8
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between() masks data.table::between()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks data.table::first()
## ✖ lubridate::hour() masks data.table::hour()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks data.table::last()
## ✖ lubridate::mday() masks data.table::mday()
## ✖ lubridate::minute() masks data.table::minute()
## ✖ lubridate::month() masks data.table::month()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ lubridate::second() masks data.table::second()
## ✖ purrr::transpose() masks data.table::transpose()
## ✖ lubridate::wday() masks data.table::wday()
## ✖ lubridate::week() masks data.table::week()
## ✖ lubridate::yday() masks data.table::yday()
## ✖ lubridate::year() masks data.table::year()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library("maps")
##
## Attaching package: 'maps'
##
## The following object is masked from 'package:purrr':
##
## map
library("ggplot2")
library("nnet")
library("rpart")
library("RColorBrewer")
Loading data The datasets are loaded using the fread function from the data.table package. Three data frames are created: HospitalRating, CensusData, and MedicareCharges.
HospitalRating<-fread("Hospital general information.csv")
CensusData<-fread("Census.csv", header = T)
MedicareCharges<-fread("Medicare.csv")
Variable selection and data cleaning Methodology: Data Integration: Utilize the detailed process outlined in the script below to merge hospital ratings, census data, and Medicare charges into a singular dataset that offers a comprehensive view of healthcare costs in relation to socioeconomic factors and hospital characteristics. The datasets are loaded using the fread function from the data.table package. Three data frames are created: HospitalRating, CensusData, and MedicareCharges.
#Selecting Variables
HospitalRating_filtered<- HospitalRating %>% select(`Hospital Type`, `Hospital Ownership`, `Hospital overall rating`, `Provider ID`)
CensusData_filtered<- CensusData %>% select(NAME, B16010_042E, B16010_048E,B16010_043E, B16010_053E)
# Selecting Variables and renaming them
CensusData_filtered <- CensusData %>%
select(NAME, educated_employed = B16010_042E, educated_not_employed = B16010_048E,
speak_english = B16010_043E, non_english = B16010_053E)
joining datasets
#Joining Data by Provider ID for Medical Charges data and the Hospital Rating Data
joined_data <- inner_join(MedicareCharges, HospitalRating_filtered, by = c("Rndrng_Prvdr_CCN" = "Provider ID" ))
# Filter and separate the NAME column to have a separate ZIP code for combining
CensusData_filtered1 <- data.frame(CensusData_filtered %>% mutate(NAME = str_trim(NAME)) %>%
separate(NAME, into = c("name1", "name2"), sep = " ", remove = FALSE))
#Converting the ZIP code variable (Rndrng_Prvdr_Zip5) to Character
joined_data$Rndrng_Prvdr_Zip5<- as.character(joined_data$Rndrng_Prvdr_Zip5)
#Joining the data sets, successfully joining the three data sets by specific variables (Here, by ZIP Code)
FullData <- inner_join(CensusData_filtered1, joined_data, by = c("name2" = "Rndrng_Prvdr_Zip5" ))
#Removing some variables from the data set and Renaming the Zip Code Variable for easy reference
FullData <- FullData %>%
select(-Rndrng_Prvdr_City, -Rndrng_Prvdr_St, -Rndrng_Prvdr_Org_Name, -name1, -NAME, -Rndrng_Prvdr_State_FIPS, -Rndrng_Prvdr_RUCA_Desc, -Rndrng_Prvdr_RUCA, -Avg_Submtd_Cvrd_Chrg, -Avg_Mdcr_Pymt_Amt, -Rndrng_Prvdr_CCN) %>% rename(ZIP_Code = name2)
# Create a data frame with state abbreviations and names
state_df <- data.frame(state.abb, state.name)
colnames(state_df) <- c("state_abbrv", "state_name")
# Merge the state names with your dataset using the state abbreviation
FullData <- merge(FullData, state_df, by.x = "Rndrng_Prvdr_State_Abrvtn", by.y = "state_abbrv", all.x = TRUE)
Data Analysis: Perform statistical analysis to identify correlations between hospital ratings, ownership types, and average total payment amounts for AMI treatment. Analyze the impact of socioeconomic factors, such as education and employment status, on healthcare costs for AMI across different geographical regions. Use geographical information to map the distribution of healthcare costs, identifying regions (I will be reducing the number of states to get a number easier to compare visually) with significantly higher or lower costs and investigating potential reasons.
Visualization: Create interactive visualizations to present the findings effectively. Use histograms to show the distribution of average total payment amounts and maps to illustrate geographical disparities in healthcare costs.
Interpretation and Discussion: Discuss the implications of the findings for healthcare policy and management. Explore how socioeconomic factors and hospital characteristics influence healthcare costs for AMI and suggest strategies for addressing disparities.
Outcome: The project will culminate in a detailed report and presentation that highlights key insights into the factors affecting healthcare costs for AMI treatment. By shedding light on these disparities, the project aims to contribute to informed policymaking and healthcare management practices that can lead to more equitable healthcare outcomes.
This proposal integrates data analysis with a focus on real-world applications, making it a fitting project for demonstrating proficiency in course objectives and contributing to broader discussions in healthcare research and policy.
Analysing Acute myocardial infarction (AMI) Data Creating a histogram that shows the distribution of average total payment amounts for Acute Myocardial Infarction(AMI) with MS-DRGs 280-282 across states.
#Filtering the Data for AMI MS-DRGs 280-282
FullData_AMI <- FullData %>% filter(DRG_Cd >= 280 & DRG_Cd <= 282)
library(ggplot2)
library(plotly)
#Filtering the Data for AMI MS-DRGs 280-282
FullData_AMI <- FullData %>% filter(DRG_Cd >= 280 & DRG_Cd <= 282)
p <- ggplot(FullData_AMI, aes(x = Avg_Tot_Pymt_Amt, fill = state_name)) +
geom_histogram(bins = 50, col = "white") +
labs(title = "Distribution of Avg_Tot_Pymt_Amt for AMI MS-DRGs 280-282",
x = "Average Total Payment Amount",
y = "Count") +
theme(plot.title = element_text(hjust = 0.5),
plot.background = element_rect(fill = "white"),
panel.background = element_rect(fill = "white"),
panel.grid.major = element_line(colour = "gray", size = 0.5))
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
Creating a Density map for the US showing various costs per State: This code chunk filters the data for AMI MS-DRGs 280-282 and creates a density plot to visualize the distribution of average total payment amounts for each state.
# Filtering the Data for AMI MS-DRGs 280-282
FullData_AMI <- FullData %>% filter(DRG_Cd >= 280 & DRG_Cd <= 282)
# Density plot
p<-ggplot(FullData_AMI, aes(x = Avg_Tot_Pymt_Amt, y = after_stat(density), fill = state_name)) +
geom_density(alpha = 0.5) +
labs(title = "Density Plot of Avg_Tot_Pymt_Amt for AMI MS-DRGs 280-282",
x = "Average Total Payment Amount",
y = "Density") +
theme(plot.title = element_text(hjust = 0.5))
ggplotly(p)
Creating an interactive heatmap to show Avg_Tot_Pymt_Amt per state: This code chunk calculates the mean average total payment amount for AMI per state and creates an interactive heatmap to visualize these values.
state_data <- FullData_AMI %>%
group_by(state_name) %>%
summarize(avg_amt = mean(Avg_Tot_Pymt_Amt))
# Create the ggplot object
heatmap_plot <- ggplot(state_data, aes(x = state_name, y = 1, fill = avg_amt)) +
geom_tile() +
scale_fill_gradient2(low = "#F7FBFF", mid = "#6BAED6", high = "#08306B", midpoint = 0) +
ggtitle("Average Total Payment Amount for Acute Myocardial Infarction by State") +
xlab("") +
ylab("") +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, size = 10),
axis.ticks.x = element_blank(),
axis.line = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank())
# Convert the ggplot object to a plotly object and display it
interactive_heatmap <- ggplotly(heatmap_plot)
interactive_heatmap
Histogram Plot of Avg_Tot_Pymt_Amt for AMI MS-DRGs 280-282: This code chunk creates a histogram to visualize the distribution of average total payment amounts for AMI MS-DRGs 280-282.
FullData_AMI <- FullData %>% filter(DRG_Cd >= 280 & DRG_Cd <= 282)
p <- ggplot(FullData_AMI, aes(x = Avg_Tot_Pymt_Amt, fill = state_name)) +
geom_histogram(bins = 100, col = "white", binwidth = NULL, position='stack') +
labs(title = "Avg_Tot_Pymt_Amt for AMI MS-DRGs 280-282",
x = "Average Total Payment Amount",
y = "Count") +
theme(plot.title = element_text(hjust = 0.5, size = 10),
plot.background = element_rect(fill = "white"),
panel.background = element_rect(fill = "white"),
panel.grid.major = element_line(colour = "gray", size = 0.9),
axis.title = element_text(size = 16),
axis.text = element_text(size = 14),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12))
ggplotly(p)
Data wrangling and variable selection: This code chunk selects relevant variables for further analysis, converts character variables to numeric, and removes variables with only one level.
# Select relevant variables for the analysis
selected_data <- FullData_AMI %>%
select(Avg_Tot_Pymt_Amt, `Hospital Type`, `Hospital Ownership`, `Hospital overall rating`, educated_employed, educated_not_employed, speak_english, non_english,DRG_Cd)
# Convert character variables to numeric
selected_data$educated_employed <- as.numeric(selected_data$educated_employed)
selected_data$educated_not_employed <- as.numeric(selected_data$educated_not_employed)
selected_data$speak_english <- as.numeric(selected_data$speak_english)
selected_data$non_english <- as.numeric(selected_data$non_english)
# Identify factor variables with only one level
single_level_factors <- sapply(selected_data, function(x) is.factor(x) && length(levels(x)) == 1)
# Remove those variables from the data
selected_data <- selected_data[!single_level_factors]
Splitting data into training and test: The data is split into a training set (70%) and a testing set (30%) to evaluate the performance of the linear regression and regression tree models. for the variables with DRG_Cd 280,281,282
AMI_data <- selected_data %>%
filter(DRG_Cd %in% c('280', '281','282'))
library(rsample)
# splitting the data into training and testing sets (AMI)
set.seed(1526)
AMI_split <- initial_split(AMI_data, prop = 0.70)
train_data <- training(AMI_split)
test_data <- testing(AMI_split)
Creating the Linear Regression Model: Linear Regression Model: This code chunk fits a linear regression model to the selected data and calculates the adjusted R2 and root mean-square error (RMSE) to evaluate the model’s performance. The predicted values are plotted against the actual values to visualize the model’s accuracy.
# Select relevant variables
selected_data <- FullData_AMI %>%
select('Rndrng_Prvdr_State_Abrvtn','DRG_Cd','Avg_Tot_Pymt_Amt', 'Hospital Type', 'Hospital Ownership', 'Hospital overall rating',
'educated_employed','educated_not_employed','Tot_Dschrgs')
# Convert categorical variables to factors
selected_data$'Hospital Type' <- as.factor(selected_data$'Hospital Type')
selected_data$'Hospital Ownership' <- as.factor(selected_data$'Hospital Ownership')
# Remove factor variables with less than 2 levels
selected_data <- selected_data %>%
select_if(~ !is.factor(.) || nlevels(.) >= 2)
# Fit the linear regression model
lin_reg_model <- lm(Avg_Tot_Pymt_Amt ~ ., data = selected_data)
# Calculate adjusted R2 and root mean-square error
lin_reg_adj_r2 <- summary(lin_reg_model)$adj.r.squared
lin_reg_rmse <- sqrt(mean(lin_reg_model$residuals^2))
cat("Adjusted R2 for Linear Regression Model:", lin_reg_adj_r2, "\n")
## Adjusted R2 for Linear Regression Model: 0.8382889
cat("Root Mean-Square Error for Linear Regression Model:", lin_reg_rmse, "\n")
## Root Mean-Square Error for Linear Regression Model: 1097.179
# Predict the values using the linear regression model
predictions_lin_reg <- predict(lin_reg_model, selected_data)
## Warning in predict.lm(lin_reg_model, selected_data): prediction from a
## rank-deficient fit may be misleading
# Plot the predicted values against the actual values
plot(selected_data$Avg_Tot_Pymt_Amt, predictions_lin_reg,
xlab = "Actual Values", ylab = "Predicted Values",
main = "Linear Regression Model: Actual vs. Predicted Values")
abline(0, 1, col = "red", lwd = 2)
# Add the adjusted R2 and RMSE to the plot
text(x = min(selected_data$Avg_Tot_Pymt_Amt), y = max(predictions_lin_reg),
labels = paste("Adjusted R2 (Linear Regression):", round(lin_reg_adj_r2, 4), "\n",
"Root Mean-Square Error (Linear Regression):", round(lin_reg_rmse, 4)),
pos = 4)
Creating the Regression Trees (CART) model: Regression Trees (CART) model: This code chunk fits a regression tree model to the training data and predicts the test data. The R2, adjusted R2, and RMSE are calculated to evaluate the model’s performance. The predicted values are plotted against the actual values to visualize the model’s accuracy.
library(rpart)
# Fit the regression tree model
tree <- rpart(Avg_Tot_Pymt_Amt ~ ., data = train_data)
# Predict the test data
predictions_tree <- predict(tree, test_data)
# Calculate the adjusted R2 and root mean-square error
# Calculate the R2 and adjusted R2
lm_tree <- lm(test_data$Avg_Tot_Pymt_Amt ~ predictions_tree)
r2_tree <- summary(lm_tree)$r.squared
adj_r2_tree <- summary(lm_tree)$adj.r.squared
# Calculate the root mean-square error
rmse_tree <- sqrt(mean((test_data$Avg_Tot_Pymt_Amt - predictions_tree)^2))
cat("R2 (Regression Tree):", r2_tree, "\n")
## R2 (Regression Tree): 0.5365508
cat("Adjusted R2 (Regression Tree):", adj_r2_tree, "\n")
## Adjusted R2 (Regression Tree): 0.5361152
cat("Root Mean-Square Error (Regression Tree):", rmse_tree, "\n")
## Root Mean-Square Error (Regression Tree): 2535.656
# Plot the predicted values against the actual values
plot(test_data$Avg_Tot_Pymt_Amt, predictions_tree,
xlab = "Actual Values", ylab = "Predicted Values",
main = "Regression Tree Model: Actual vs. Predicted Values")
abline(0, 1, col = "red", lwd = 2)
# Add the R2, adjusted R2, and RMSE to the plot
text(x = min(test_data$Avg_Tot_Pymt_Amt), y = max(predictions_tree),
labels = paste("R2 (Regression Tree):", round(r2_tree, 4), "\n",
"Adjusted R2 (Regression Tree):", round(adj_r2_tree, 4), "\n",
"Root Mean-Square Error (Regression Tree):", round(rmse_tree, 4)),
pos = 4)
Create the Feedforward ANN model with 5 hidden layers
# Install and load the neuralnet package
if (!require(neuralnet)) {
install.packages("neuralnet")
library(neuralnet)
}
## Loading required package: neuralnet
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
##
## compute
# Identify non-numeric variables
non_numeric_vars <- sapply(train_data, function(x) !is.numeric(x))
# Convert non-numeric variables to numeric variables if possible
train_data[, non_numeric_vars] <- lapply(train_data[, non_numeric_vars], function(x) if (!any(is.na(as.numeric(x)))) as.numeric(x) else x)
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
test_data[, non_numeric_vars] <- lapply(test_data[, non_numeric_vars], function(x) if (!any(is.na(as.numeric(x)))) as.numeric(x) else x)
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
# Remove remaining non-numeric variables
train_data <- train_data[, sapply(train_data, is.numeric)]
test_data <- test_data[, sapply(test_data, is.numeric)]
# Normalize the data
normalize <- function(x) {
return ((x - min(x)) / (max(x) - min(x)))
}
train_data_norm <- as.data.frame(lapply(train_data, normalize))
test_data_norm <- as.data.frame(lapply(test_data, normalize))
# Fit the ANN model with 2 hidden layers
set.seed(123)
ann <- neuralnet(Avg_Tot_Pymt_Amt ~ ., data = train_data_norm, hidden = c(5, 5), linear.output = TRUE)
# Predict the test data
predictions_ann <- predict(ann, test_data_norm)
# Re-scale the predictions
predictions_ann <- predictions_ann * (max(test_data$Avg_Tot_Pymt_Amt) - min(test_data$Avg_Tot_Pymt_Amt)) + min(test_data$Avg_Tot_Pymt_Amt)
# Calculate the adjusted R2 and root mean-square error
# Calculate the adjusted R2 and root mean-square error
lm_ann <- lm(test_data$Avg_Tot_Pymt_Amt ~ predictions_ann)
adj_r2_ann <- summary(lm_ann)$adj.r.squared
rmse_ann <- sqrt(mean((test_data$Avg_Tot_Pymt_Amt - predictions_ann)^2))
cat("Adjusted R2 (ANN):", adj_r2_ann, "\n")
## Adjusted R2 (ANN): 0.442596
cat("Root Mean-Square Error (ANN):", rmse_ann, "\n")
## Root Mean-Square Error (ANN): 3472.15
# Plot the predicted values against the actual values
plot(test_data$Avg_Tot_Pymt_Amt, predictions_ann,
xlab = "Actual Values", ylab = "Predicted Values",
main = "Artificial Neural Network Model: Actual vs. Predicted Values")
abline(0, 1, col = "red", lwd = 2)
# Add the adjusted R2 and RMSE to the plot
text(x = min(test_data$Avg_Tot_Pymt_Amt), y = max(predictions_ann),
labels = paste("Adjusted R2 (ANN):", round(adj_r2_ann, 4), "\n",
"Root Mean-Square Error (ANN):", round(rmse_ann, 4)),
pos = 4)
compare the performance of each model: Linear regression had the highest value of R2 therefore it performed well in predicting average cost for AMI.
# Compare performance of the three models
performance <- data.frame(
Model = c("Linear Regression", "Regression Tree", "Artificial Neural Network"),
Adjusted_R2 = c(lin_reg_adj_r2, adj_r2_tree, adj_r2_ann),
RMSE = c(lin_reg_rmse, rmse_tree, rmse_ann)
)
performance <- performance %>%
arrange(desc(Adjusted_R2), RMSE) %>%
mutate(Rank = row_number())
cat("Comparison of Model Performance:\n")
## Comparison of Model Performance:
print(performance)
## Model Adjusted_R2 RMSE Rank
## 1 Linear Regression 0.8382889 1097.179 1
## 2 Regression Tree 0.5361152 2535.656 2
## 3 Artificial Neural Network 0.4425960 3472.150 3
Comparing Model Performace for Regression Tree, ANN and Linear regression
# Plot the performance comparison
library(ggplot2)
# Adjusted R2 comparison
ggplot(performance, aes(x = Model, y = Adjusted_R2, fill = Model)) +
geom_bar(stat = "identity") +
theme_minimal() +
labs(title = "Adjusted R2 Comparison",
x = "Model",
y = "Adjusted R2") +
theme(legend.position = "none")
# RMSE comparison
ggplot(performance, aes(x = Model, y = RMSE, fill = Model)) +
geom_bar(stat = "identity") +
theme_minimal() +
labs(title = "Root Mean-Square Error Comparison",
x = "Model",
y = "RMSE") +
theme(legend.position = "none")
Based on the R-squared and RMSE values obtained for the three models, the Linear Regression model performed the best in predicting the average cost for healthcare for Acute Myocardial Infarction (AMI). The Linear Regression model had the highest R-squared value of 0.8382889 and the lowest RMSE value of 1097.179, indicating that it explains more of the variance in the data and has better accuracy in predicting the average cost compared to the other two models. The Regression Tree model ranked second, with an R-squared value of 0.5361152 and an RMSE value of 2535.656. Although this model performed better than the Artificial Neural Network model, it was not as accurate as the Linear Regression model. The Artificial Neural Network model had the lowest performance among the three models, with an R-squared value of 0.4425960 and an RMSE value of 3472.150. This suggests that the ANN model might not be the best choice for predicting the average cost for healthcare for AMI in this case. In conclusion, the Linear Regression model is the most suitable model for accurately predicting the average cost of healthcare for AMI patients in the Medicare system. This model takes into account factors such as hospital rating, employment, and English language proficiency, which were found to be significant predictors of the average cost of treatment. Healthcare providers and policymakers can use this model to better understand the factors influencing healthcare costs and to make more informed decisions about resource allocation and cost management.
Surgical hip/femur fracture treatment (SHFFT)
#Filtering the Data for Surgical hip/femur fracture treatment (SHFFT)
FullData_SHFFT <- FullData %>% filter(DRG_Cd >= 480 & DRG_Cd <= 482)
# Calculate average cost per state
FullData_SHFFT <- FullData_SHFFT %>% group_by(Rndrng_Prvdr_State_Abrvtn)
Creating a histogram showing Avg_Tot_Pymt_Amt for Surgical hip/femur fracture treatment (SHFFT)
library(plotly)
# Filter data for SHFFT
FullData_SHFFT <- FullData %>% filter(DRG_Cd >= 480 & DRG_Cd <= 482)
# Create histogram of average total payment amount for SHFFT
histogram <- ggplot(FullData_SHFFT, aes(x = Avg_Tot_Pymt_Amt, fill = Rndrng_Prvdr_State_Abrvtn)) +
geom_histogram(binwidth = 5000, color = "black") +
labs(title = "Distribution of Average Total Payment Amount for Surgical Hip/Femur Fracture Treatment",
x = "Average Total Payment Amount",
y = "Count",
fill = "State") +
scale_fill_discrete(na.translate = FALSE) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 10),
axis.title = element_text(face = "bold", size = 14),
legend.title = element_text(face = "bold", size = 14),
legend.position = "right",
legend.box.spacing = unit(0.2, "cm"),
legend.margin = margin(t = 0, b = 0, l = 0, r = 20))
# Make the histogram interactive using plotly
ggplotly(histogram, tooltip = c("fill", "x", "y")) %>%
layout(title = list(text = "Distribution of Average Total Payment Amount for Surgical Hip/Femur Fracture Treatment",
x = 0.5),
xaxis = list(title = "Average Total Payment Amount"),
yaxis = list(title = "Count"))
Data wrangling and variable selection
library(dplyr)
# Convert character columns to numeric
FullData_SHFFT$educated_employed <- as.numeric(FullData_SHFFT$educated_employed)
FullData_SHFFT$educated_not_employed <- as.numeric(FullData_SHFFT$educated_not_employed)
FullData_SHFFT$speak_english <- as.numeric(FullData_SHFFT$speak_english)
FullData_SHFFT$non_english <- as.numeric(FullData_SHFFT$non_english)
FullData_SHFFT$`Hospital overall rating` <- as.numeric(FullData_SHFFT$`Hospital overall rating`)
# Select relevant variables
selected_data_SHIFT <- FullData_SHFFT %>% select(Avg_Tot_Pymt_Amt, educated_employed, educated_not_employed, speak_english, non_english, Tot_Dschrgs, `Hospital Type`, `Hospital overall rating`, DRG_Cd,state_name)
Splitting data into TRAIN (70%) AND TEST (30%) SET for model building
SHIFT_Final<-selected_data_SHIFT %>%
filter(DRG_Cd %in% c('480','482',' 482'))
library(rsample)
# splitting the data into training and testing sets (AMI)
set.seed(1526)
SHIFT_split <- initial_split(SHIFT_Final, prop = 0.70)
train_data_SHIFT <- training(SHIFT_split)
test_data_SHIFT <- testing(SHIFT_split)
Decision Tree model Its used for predict the Avg_Tot_Pymt_Amt for Surgical hip/femur fracture treatment (SHFFT)
library(rpart.plot)
library(rpart)
train_data_SHIFT_fixed <- na.omit(train_data_SHIFT_fixed)
## Error in na.omit(train_data_SHIFT_fixed): object 'train_data_SHIFT_fixed' not found
# Decision Tree
tree_model <- rpart(Avg_Tot_Pymt_Amt ~ ., data = train_data_SHIFT_fixed)
## Error in is.data.frame(data): object 'train_data_SHIFT_fixed' not found
# Compute R-squared value for the Decision Tree model
tree_pred <- predict(tree_model, test_data_SHIFT_fixed)
## Error in predict(tree_model, test_data_SHIFT_fixed): object 'tree_model' not found
tree_r2 <- 1 - (sum((test_data_SHIFT_fixed$Avg_Tot_Pymt_Amt - tree_pred)^2) / sum((test_data_SHIFT_fixed$Avg_Tot_Pymt_Amt - mean(test_data_SHIFT_fixed$Avg_Tot_Pymt_Amt))^2))
## Error in eval(expr, envir, enclos): object 'test_data_SHIFT_fixed' not found
# Plot the tree
rpart.plot(tree_model, main = "Decision Tree", tweak=2.5)
## Error in rpart.plot(tree_model, main = "Decision Tree", tweak = 2.5): object 'tree_model' not found
Linear regression model Data wrangling and variable selection before buiding the model
# Convert Hospital Type to a factor
train_data_SHIFT$`Hospital Type` <- as.factor(train_data_SHIFT$`Hospital Type`)
# Select variables for the models
selected_vars <- c("Avg_Tot_Pymt_Amt", "educated_employed", "educated_not_employed", "speak_english", "non_english", "Tot_Dschrgs", "Hospital overall rating")
train_data_selected <- train_data_SHIFT[selected_vars]
#Linear regression model
lm_model <- lm(Avg_Tot_Pymt_Amt ~ ., data = train_data_selected)
summary(lm_model)
##
## Call:
## lm(formula = Avg_Tot_Pymt_Amt ~ ., data = train_data_selected)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13303 -5111 -113 3322 38051
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16381.1739 638.9835 25.636 < 2e-16 ***
## educated_employed 1.2929 0.1750 7.387 3.00e-13 ***
## educated_not_employed -1.2210 0.2109 -5.789 9.29e-09 ***
## speak_english -0.9792 0.2067 -4.737 2.46e-06 ***
## non_english 3.8679 4.7297 0.818 0.414
## Tot_Dschrgs 78.7023 18.7932 4.188 3.05e-05 ***
## `Hospital overall rating` -348.2894 156.2554 -2.229 0.026 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5979 on 1078 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.09884, Adjusted R-squared: 0.09383
## F-statistic: 19.71 on 6 and 1078 DF, p-value: < 2.2e-16
recalculate the adjusted R2 and RMSE for the linear regression and decision tree models, and the RMSE for the ANN model: The ANN model performed better in predicting the average cost of health care using this variables on the test set “Avg_Tot_Pymt_Amt” “educated_employed”, “educated_not_employed”, “speak_english”,“non_english”,“Tot_Dschrgs” ,“Hospital Type”, “Hospital overall rating” “DRG_Cd”, “state_name” because it had the highest R^2 value.
# R-squared function
r_squared <- function(true_values, predicted_values) {
ss_res <- sum((true_values - predicted_values) ^ 2)
ss_tot <- sum((true_values - mean(true_values)) ^ 2)
r2 <- 1 - (ss_res / ss_tot)
return(r2)
}