# call libraries and set seed
library(caret)
library(plotly)
library(dplyr)
library(data.table)
set.seed(123)
SNF_TXc <- read.csv("SNF_TX.csv")
SNF_TXm <- SNF_TXc[, -c(1, 41:44)]
names(SNF_TXm)
## [1] "Distinct_Beneficiaries"
## [2] "Episode_or_Stay_Count"
## [3] "Days_of_Service"
## [4] "Percent_of_Beneficiaries_with_30_or_fewer_Service_Days"
## [5] "Percent_of_Beneficiaries_with_60_or_more_Service_Days"
## [6] "Percent_Dual_Beneficiaries"
## [7] "Percent_Medicare_Beneficiaries_in_a_Rural_ZIP"
## [8] "Average_Age"
## [9] "Percent_Male_Beneficiaries"
## [10] "Percent_Female_Beneficiaries"
## [11] "Percent_White_Beneficiaries"
## [12] "Average_HCC_Score"
## [13] "Average_Number_of_Chronic_Conditions"
## [14] "Percent_of_Beneficiaries_with_Alzheimer.s"
## [15] "Percent_of_Beneficiaries_with_CHF"
## [16] "Percent_of_Beneficiaries_with_Chronic_Kidney_Disease"
## [17] "Percent_of_Beneficiaries_with_COPD"
## [18] "Percent_of_Beneficiaries_with_Depression"
## [19] "Percent_of_Beneficiaries_with_Diabetes"
## [20] "Percent_of_Beneficiaries_with_Hyperlipidemia"
## [21] "Percent_of_Beneficiaries_with_IHD"
## [22] "Percent_of_Beneficiaries_with_RA.OA"
## [23] "Total_Physical_Therapy_Minutes"
## [24] "Total_Occupational_Therapy_Minutes"
## [25] "Total_Speech.Language_Pathology_Minutes"
## [26] "Assessment_Denominator"
## [27] "Percentage_of_RV_Assessments_within_10_minutes_of_Threshold"
## [28] "Percentage_of_RU_Assessments_within_10_minutes_of_Threshold"
## [29] "Percentage_with_an_Inpatient_Hospital_Discharge_Status"
## [30] "Percentage_with_a_Home_Health_Discharge_Status"
## [31] "Percentage_with_an_Unknown_Discharge_Status"
## [32] "Percentage_of_Days_in_SNF_AAA_RUG"
## [33] "Percentage_of_Days_in_SNF_Low._Medium_or_High_Rehabilitation_RUGs"
## [34] "Percentage_of_Days_in_SNF_Very.High_Rehabilitation_RUGs"
## [35] "Percentage_of_Days_in_SNF_Ultra.High_Rehabilitation_RUGs"
## [36] "Percentage_of_Days_in_SNF_Clinically_Complex_RUGs"
## [37] "Percentage_of_Days_in_SNF_Reduced_Physical_Functioning_RUGs"
## [38] "Percentage_of_Days_in_SNF_Special_High_Care_RUGs"
## [39] "Percentage_of_Days_in_SNF_Behavioral_Symptom_RUGs"
## [40] "Total_Coinsurance_Amount"
## [41] "Total_Charge_Amount"
control <- rfeControl(functions = rfFuncs, method = "repeatedcv", repeats = 3, verbose = FALSE)
rfe_profile <- rfe(SNF_TXm[, 1:40], as.matrix(SNF_TXm[, 41]), sizes = c(5:40), rfeControl = control)
plot(rfe_profile, type = c("g", "o"))
From Recursive Feature Elimination, we got a best combination of 33 variables. We will create a new dataset called SNF_TXm use all 33 independent variables and the one target variable we need to predict.
# the final dataset for model building
SNF_TXm <- SNF_TXm[, c(paste(rfe_profile$optVariables), "Total_Charge_Amount")]
inTraining <- createDataPartition(SNF_TXm$Total_Charge_Amount, p = .75, list = FALSE)
training <- SNF_TXm[inTraining, ]
testing <- SNF_TXm[-inTraining, ]
fitControl <- trainControl(method = "cv", number = 10, preProcOptions = c("center", "scale"))
# build the model
rf <- train(Total_Charge_Amount~., data = training, method = "rf", trControl = fitControl, importance = TRUE )
rf
## Random Forest
##
## 910 samples
## 36 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 819, 820, 819, 818, 819, 819, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 1118172 0.7740022 650717.1
## 19 1042089 0.8099059 591252.2
## 36 1023887 0.8162113 584914.3
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 36.
# test the model and make the evaluation
rf_testing <- predict(rf, testing)
rf_evaluate <- postResample(rf_testing, testing$Total_Charge_Amount)
round(rf_evaluate, 4)
## RMSE Rsquared MAE
## 794120.1652 0.8429 567347.8537
# build the model
svm <- train(Total_Charge_Amount~., data = training, method = "svmLinear", trControl = fitControl, importance = TRUE)
svm
## Support Vector Machines with Linear Kernel
##
## 910 samples
## 36 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 819, 821, 819, 819, 818, 819, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 1108929 0.7688972 596663.7
##
## Tuning parameter 'C' was held constant at a value of 1
# test the model and make the evaluation
svm_testing <- predict(svm, testing)
svm_evaluate <- postResample(svm_testing, testing$Total_Charge_Amount)
round(svm_evaluate, 4)
## RMSE Rsquared MAE
## 777142.7827 0.8438 564172.1103
model_comparison <- resamples(list(Random_Forest = rf, SVM = svm))
summary(model_comparison)
##
## Call:
## summary.resamples(object = model_comparison)
##
## Models: Random_Forest, SVM
## Number of resamples: 10
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Random_Forest 468595.6 543213.5 599499.8 584914.3 627595.6 681573.8 0
## SVM 479865.5 517486.6 566799.6 596663.7 621718.7 929656.6 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Random_Forest 695185.8 824127.6 929784.2 1023887 1040242 1906623 0
## SVM 664067.5 747964.8 951668.8 1108929 1143191 2565805 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Random_Forest 0.7544647 0.7614560 0.8209345 0.8162113 0.8597519 0.8959304 0
## SVM 0.6679970 0.7082341 0.7663113 0.7688972 0.8359856 0.8850842 0
dotplot(model_comparison)
Random Forest and SVM have similar performance. Random Forest has a slight higher Rsquared while SVM took less time to train. If the dataset is not significantly large, I would recommand to use Random Forest model for the future training.
We used correlation coefficient to generate 10 high-impact factors on Total_Charge_Amount in the part 1 analysis. Here, we will use the random forest model we built to find out the importance ranking of all 33 variables/factors.
# variable importance ranking
feature_importance <- varImp(rf, scale = FALSE)
plot(feature_importance)
The importance ranking confirmed our analysis in the first part that Days_of_Service is a big drive of the costs.
Percentage of days in SNF Rehabilitaion of different levels(Low, Medium, High, Very-High, Ultra-High) also plays an importance role on determine total costs.
Total Occupatinal Therapy Minutes and Total Physical Physical Therapy Minutes also drive the total costs up. But from the comparison of Texas and the Nation from part 1 analysis, we can see the Therapy Minutes per stay are very similar for Texas and the Nation, while the total cost of Texas is higher than the Nation.
We have concluded that Days_of_Service is the big drive of the total cost for SNF, we also suspect that Percentage of days in SNF Rehabilitation of different levels and Total Therapy Minutes are related to Days of Service.
For the next step, we could dive into the analysis of the factors which effect Days of Service focusing on the following aspects: