Introduction

This analysis forecasts the number of diagnoses and admissions for heart failure.

Three units of analysis (UA): hospital, state, county Two dependent variables: number of diagnoses (hospital UA), admission rate (state & county)

Methods: OLS, Lasso, enet, RF, extra trees, gradient boosting, bagging, hospital UA OLS, Lasso, enet, GIS for admission rates at state and county UA.

Software: Python 3.8 R Version 4.0.2 R Studio IDE

Setup

Load the printing functions and all libraries

#############################################################################
knitr::opts_chunk$set(echo = TRUE)    
library(Amelia)
## Warning: package 'Amelia' was built under R version 4.0.5
## Warning: package 'Rcpp' was built under R version 4.0.5
library(car)
## Warning: package 'car' was built under R version 4.0.5
library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Warning: package 'ggplot2' was built under R version 4.0.5
library(clusterGeneration)
## Warning: package 'clusterGeneration' was built under R version 4.0.5
library(corrplot)
#library(covTest)
library(elsa)        
## Warning: package 'elsa' was built under R version 4.0.5
## Warning: package 'sp' was built under R version 4.0.5
## Warning: package 'raster' was built under R version 4.0.5
## Warning: namespace 'terra' is not available and has been replaced
## by .GlobalEnv when processing object '<unknown>'

## Warning: namespace 'terra' is not available and has been replaced
## by .GlobalEnv when processing object '<unknown>'

## Warning: namespace 'terra' is not available and has been replaced
## by .GlobalEnv when processing object '<unknown>'

## Warning: namespace 'terra' is not available and has been replaced
## by .GlobalEnv when processing object '<unknown>'
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.0.5
library(fpp2)
## Warning: package 'fpp2' was built under R version 4.0.5
## Warning: package 'forecast' was built under R version 4.0.5
## Warning: package 'fma' was built under R version 4.0.5
## Warning: package 'expsmooth' was built under R version 4.0.5
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.0.5
library(ggExtra)     
## Warning: package 'ggExtra' was built under R version 4.0.5
library(ggplot2)
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.0.5
library(glmpath)     
## Warning: package 'glmpath' was built under R version 4.0.5
library(grid)        
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.0.5
library(hts)
## Warning: package 'hts' was built under R version 4.0.5
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.0.5
library(knitr)
## Warning: package 'knitr' was built under R version 4.0.5
library(lars)        
library(lattice)
library(latticeExtra)
## Warning: package 'latticeExtra' was built under R version 4.0.5
library(leaflet)     
## Warning: package 'leaflet' was built under R version 4.0.5
library(leaps)
## Warning: package 'leaps' was built under R version 4.0.5
#library(lmSupport)   
library(maptools)    
## Warning: package 'maptools' was built under R version 4.0.5
library(MASS)
library(mice)
## Warning: package 'mice' was built under R version 4.0.5
library(NbClust)
library(psych)
## Warning: package 'psych' was built under R version 4.0.5
library(QuantPsyc)
## Warning: package 'QuantPsyc' was built under R version 4.0.5
## Warning: package 'boot' was built under R version 4.0.5
library(raster)      
library(RColorBrewer)
library(ResourceSelection)
## Warning: package 'ResourceSelection' was built under R version 4.0.5
library(reticulate)
## Warning: package 'reticulate' was built under R version 4.0.5
library(rgdal)       
library(rgeos)       
## Warning: package 'rgeos' was built under R version 4.0.5
library(scales)
## Warning: package 'scales' was built under R version 4.0.5
library(sf)          
## Warning: package 'sf' was built under R version 4.0.5
library(shiny)       
## Warning: package 'shiny' was built under R version 4.0.5
library(sp)          
library(spatialEco)
## Warning: package 'spatialEco' was built under R version 4.0.5
library(spatialreg)  
## Warning: package 'spatialreg' was built under R version 4.0.5
## Warning: package 'spData' was built under R version 4.0.5
## Warning: namespace 'terra' is not available and has been replaced
## by .GlobalEnv when processing object '<unknown>'

## Warning: namespace 'terra' is not available and has been replaced
## by .GlobalEnv when processing object '<unknown>'
library(spData)      
library(spdep)       
## Warning: package 'spdep' was built under R version 4.0.5
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'purrr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'stringr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.5
library(totalcensus)
## Warning: package 'totalcensus' was built under R version 4.0.5
library(tmap)        
## Warning: package 'tmap' was built under R version 4.0.5
library(tmaptools)   
## Warning: package 'tmaptools' was built under R version 4.0.5
library(usmap)       
## Warning: package 'usmap' was built under R version 4.0.5
library(viridis)
## Warning: package 'viridis' was built under R version 4.0.5
## Warning: package 'viridisLite' was built under R version 4.0.5
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.0.5
#############################################################################
print("Set up >50 R Libraries...")
## [1] "Set up >50 R Libraries..."

Environment Set Up

#############################################################################
Sys.setenv('RETICULATE_PYTHON'='C://Users//tf')
use_python('C://Users')
use_condaenv("tf") 
py$RANDOM_SEED = 1234
matplotlib=import('matplotlib.pyplot')
options(scipen=9); options(digits=3)
#############################################################################
print("Invoked Python ...")
## [1] "Invoked Python ..."

Functions

################################PRINT########################################
myprint=function(x,nm) {
  x%>%kbl(col.names = nm)%>%kable_classic(full_width = F, html_font = "Cambria")}
#############################################################################

##############################COREELATION####################################
corfunction=function(d){
  mycorr=cor(d[, 1:ncol(d)]); p.mat=cor_pmat(d[,1:ncol(d)])
  myplot=ggcorrplot(mycorr, hc.order=TRUE,type="lower",colors=c("red", "white","green"),tl.cex = 8, tl.col = "black", lab=TRUE, lab_size=3, p.mat=p.mat, insig="pch", pch=4)
  print(myplot)}
#############################################################################

#################################SCALE#######################################
myscale=function(x) {(x-min(x))/(max(x)-min(x))}
#############################################################################

print("Set up functions for use...")
## [1] "Set up functions for use..."

Read the Data

#############################################################################
mycsv=read.csv("D:/publications/Heart/thf5.csv", stringsAsFactors = TRUE)
#############################################################################

print("Read the data...")
## [1] "Read the data..."

Delete Rows

#############################################################################
missmap(mycsv)

mycsv$na_count = apply(mycsv, 1, function(x) sum(is.na(x))) #count NA's
mycsv=mycsv[mycsv$na_count<5,] #delete columns with 5 or more NAs (33%)
mycsv$na_count=NULL #remove count variable
#############################################################################

print("Deleting largely empty rows..")
## [1] "Deleting largely empty rows.."

Completeness by Column

#############################################################################
myprint(sort(apply(mycsv,2,function(x) sum(is.na(x))/length(x))), "Missing")
Missing
Diagnoses 0.000
Equity 0.000
Ownership_ 0.000
Medical_School_Affiliation_ 0.000
Hospital_Type_ 0.000
State_ 0.000
Urban_Rural_ 0.000
Year_ 0.000
DRG_ 0.000
Discharges 0.000
Medicare 0.000
Net_Income 0.001
Employees 0.002
Affiliated_Physicians 0.010
Profit_Margin 0.010
Staffed_Beds 0.017
Cash_on_Hand 0.039
ER_Visits 0.067
Medicaid 0.084
Surgeries 0.101
#############################################################################

Train and Test Set

#############################################################################
set.seed(1234) #set a pseudo-random number
mys=sort(sample(1:nrow(mycsv),.2*nrow(mycsv), replace=FALSE))#sample 20%
train=mycsv[-mys,]; test=mycsv[mys,] #test set is that 20%
#############################################################################
print("Make training and test sets for model building...")
## [1] "Make training and test sets for model building..."

Impute the Missing

#############################################################################
for(i in 1:13){ #replace 1% missing with means
  mycsv[is.na(mycsv[,i]), i] <- mean(mycsv[,i], na.rm = TRUE) 
  train[is.na(train[,i]), i] <- mean(train[,i], na.rm = TRUE) 
  test[is.na(test[,i]), i] <- mean(test[,i], na.rm = TRUE)    }
missmap(mycsv)

#############################################################################

Describe the Data

We provide descriptive statistics by year and overall using the total dataset.

#############################################################################
mydescribe=function(x,y){describe(x)%>%kbl(caption=y)%>%kable_classic(full_width = F, html_font = "Cambria")}
mydescribe(mycsv[mycsv$Year_=='Y16',],'Year 2016')
Year 2016
vars n mean sd median trimmed mad min max range skew kurtosis se
Diagnoses 1 13534 1474.421 2572.021 423.000 869.809 547.079 11.0 39576.000 39565.000 3.598 19.609 22.109
Net_Income 2 13534 17159592.218 118788532.052 1959540.000 6906718.464 7687041.560 -1214338654.0 3309549602.000 4523888256.000 11.516 259.295 1021083.902
Profit_Margin 3 13534 -0.031 1.253 -0.018 -0.020 0.128 -15.5 62.071 77.525 37.882 1792.513 0.011
Cash_on_Hand 4 13534 20182280.996 119241829.018 1913427.000 5666635.050 2868853.239 -2512769971.0 3880477236.000 6393247207.000 10.509 373.462 1024980.358
Equity 5 13534 173861922.263 628434725.437 33863550.000 75970084.376 51538597.100 -3248783235.0 10242977000.000 13491760235.000 11.384 166.724 5401906.823
Discharges 6 13534 6963.759 9883.288 2743.000 4922.636 3664.987 1.0 129339.000 129338.000 3.039 16.466 84.955
ER_Visits 7 13534 32723.756 33875.156 24969.000 27184.606 24502.930 0.0 543457.000 543457.000 3.027 21.221 291.184
Surgeries 8 13534 6323.656 7934.277 4464.500 4847.949 4333.640 0.0 130741.000 130741.000 4.538 40.539 68.202
Staffed_Beds 9 13534 146.009 171.736 87.000 112.371 91.921 2.0 2753.000 2751.000 3.046 19.535 1.476
Affiliated_Physicians 10 13534 230.402 351.820 104.000 155.111 130.469 1.0 4328.000 4327.000 3.859 23.763 3.024
Employees 11 13534 998.841 1669.983 431.000 644.058 475.915 13.0 26491.000 26478.000 5.193 44.359 14.355
Medicare 12 13534 0.449 0.187 0.423 0.440 0.182 0.0 0.983 0.983 0.404 -0.296 0.002
Medicaid 13 13534 0.087 0.091 0.063 0.071 0.056 0.0 0.869 0.869 2.610 10.601 0.001
Ownership_* 14 13534 2.377 0.793 3.000 2.471 0.000 1.0 3.000 2.000 -0.776 -0.980 0.007
Medical_School_Affiliation_* 15 13534 3.549 0.840 4.000 3.715 0.000 1.0 5.000 4.000 -1.553 1.158 0.007
Hospital_Type_* 16 13534 5.583 2.164 7.000 5.855 0.000 1.0 7.000 6.000 -0.975 -0.961 0.019
State_* 17 13534 25.884 14.622 25.000 25.941 19.274 1.0 51.000 50.000 -0.014 -1.304 0.126
Urban_Rural_* 18 13534 1.543 0.498 2.000 1.554 0.000 1.0 2.000 1.000 -0.172 -1.970 0.004
Year_* 19 13534 1.000 0.000 1.000 1.000 0.000 1.0 1.000 0.000 NaN NaN 0.000
DRG_* 20 13534 1.982 0.809 2.000 1.977 1.483 1.0 3.000 2.000 0.033 -1.471 0.007
mydescribe(mycsv[mycsv$Year_=='Y17',], 'Year 2017')
Year 2017
vars n mean sd median trimmed mad min max range skew kurtosis se
Diagnoses 1 13434 1685.886 3533.270 376.000 816.951 475.915 11.000 53822.000 53811.000 4.377 27.642 30.484
Net_Income 2 13434 17454389.672 117717320.828 2055648.000 7080657.720 7815103.359 -1214338654.000 3309549602.000 4523888256.000 11.679 266.875 1015635.090
Profit_Margin 3 13434 -0.026 1.247 -0.017 -0.019 0.126 -15.454 62.071 77.525 38.881 1839.785 0.011
Cash_on_Hand 4 13434 20255460.279 120964160.745 1952651.000 5726982.511 2928361.838 -2512769971.000 3880477236.000 6393247207.000 10.090 359.031 1043647.999
Equity 5 13434 174351760.311 619985872.380 34675003.500 76916073.119 52488620.493 -3248783235.000 10242977000.000 13491760235.000 11.328 167.239 5349080.347
Discharges 6 13434 7012.265 9904.395 2792.500 4972.690 3710.206 4.000 129339.000 129335.000 3.029 16.381 85.453
ER_Visits 7 13434 32857.654 34004.995 24969.000 27300.944 24508.861 0.000 543457.000 543457.000 3.022 21.081 293.386
Surgeries 8 13434 6351.913 8014.677 4449.000 4857.215 4346.983 0.000 130741.000 130741.000 4.530 39.954 69.149
Staffed_Beds 9 13434 146.631 172.356 87.000 112.873 91.921 3.000 2753.000 2750.000 3.030 19.338 1.487
Affiliated_Physicians 10 13434 231.616 353.665 104.000 155.710 130.469 1.000 4328.000 4327.000 3.835 23.417 3.051
Employees 11 13434 1008.742 1683.827 436.000 651.263 481.845 4.000 26491.000 26487.000 5.198 44.263 14.528
Medicare 12 13434 0.448 0.186 0.422 0.439 0.181 0.002 0.983 0.981 0.411 -0.279 0.002
Medicaid 13 13434 0.087 0.091 0.063 0.071 0.056 0.000 0.869 0.869 2.613 10.687 0.001
Ownership_* 14 13434 2.378 0.793 3.000 2.473 0.000 1.000 3.000 2.000 -0.781 -0.975 0.007
Medical_School_Affiliation_* 15 13434 3.544 0.843 4.000 3.710 0.000 1.000 5.000 4.000 -1.536 1.101 0.007
Hospital_Type_* 16 13434 5.588 2.164 7.000 5.861 0.000 1.000 7.000 6.000 -0.981 -0.952 0.019
State_* 17 13434 25.868 14.609 25.000 25.916 19.274 1.000 51.000 50.000 -0.010 -1.302 0.126
Urban_Rural_* 18 13434 1.544 0.498 2.000 1.555 0.000 1.000 2.000 1.000 -0.178 -1.969 0.004
Year_* 19 13434 2.000 0.000 2.000 2.000 0.000 2.000 2.000 0.000 NaN NaN 0.000
DRG_* 20 13434 1.971 0.809 2.000 1.964 1.483 1.000 3.000 2.000 0.053 -1.470 0.007
mydescribe(mycsv[mycsv$Year_=='Y18',], 'Year 2018')
Year 2018
vars n mean sd median trimmed mad min max range skew kurtosis se
Diagnoses 1 13289 1763.025 3780.335 358.000 825.385 453.676 11.000 57461.000 57450.000 4.400 27.846 32.793
Net_Income 2 13289 17249021.440 117586625.569 2071404.000 7115267.682 7737194.212 -1214338654.000 3309549602.000 4523888256.000 11.705 270.436 1020027.256
Profit_Margin 3 13289 -0.023 1.248 -0.016 -0.018 0.126 -15.454 62.071 77.525 39.344 1854.591 0.011
Cash_on_Hand 4 13289 20469711.526 121724272.987 1999330.000 5809937.758 2991769.675 -2512769971.000 3880477236.000 6393247207.000 10.017 353.916 1055920.055
Equity 5 13289 177575765.876 634571647.806 35038130.000 78143024.189 52950626.081 -3248783235.000 10242977000.000 13491760235.000 11.255 163.063 5504710.877
Discharges 6 13289 7067.705 9937.316 2867.000 5028.271 3822.143 2.000 129339.000 129337.000 3.016 16.264 86.203
ER_Visits 7 13289 33014.752 34051.638 25326.000 27456.406 24815.759 0.000 543457.000 543457.000 3.019 21.065 295.387
Surgeries 8 13289 6372.826 8013.795 4480.000 4880.009 4361.809 0.000 130741.000 130741.000 4.517 39.923 69.517
Staffed_Beds 9 13289 146.889 173.335 86.000 112.901 90.439 3.000 2753.000 2750.000 3.026 19.191 1.504
Affiliated_Physicians 10 13289 233.367 354.940 104.000 157.278 130.469 1.000 4328.000 4327.000 3.819 23.236 3.079
Employees 11 13289 1016.679 1698.380 440.000 656.473 486.293 12.000 26491.000 26479.000 5.181 43.660 14.733
Medicare 12 13289 0.448 0.186 0.421 0.439 0.181 0.002 0.983 0.981 0.426 -0.282 0.002
Medicaid 13 13289 0.087 0.090 0.062 0.070 0.055 0.000 0.869 0.869 2.570 10.324 0.001
Ownership_* 14 13289 2.379 0.795 3.000 2.474 0.000 1.000 3.000 2.000 -0.784 -0.977 0.007
Medical_School_Affiliation_* 15 13289 3.538 0.844 4.000 3.704 0.000 1.000 5.000 4.000 -1.528 1.054 0.007
Hospital_Type_* 16 13289 5.599 2.162 7.000 5.875 0.000 1.000 7.000 6.000 -0.995 -0.929 0.019
State_* 17 13289 25.763 14.606 25.000 25.789 19.274 1.000 51.000 50.000 0.000 -1.301 0.127
Urban_Rural_* 18 13289 1.544 0.498 2.000 1.555 0.000 1.000 2.000 1.000 -0.176 -1.969 0.004
Year_* 19 13289 3.000 0.000 3.000 3.000 0.000 3.000 3.000 0.000 NaN NaN 0.000
DRG_* 20 13289 1.962 0.809 2.000 1.952 1.483 1.000 3.000 2.000 0.070 -1.468 0.007
mydescribe(mycsv, 'All Years')
All Years
vars n mean sd median trimmed mad min max range skew kurtosis se
Diagnoses 1 40257 1640.258 3334.942 385.000 835.737 492.223 11.0 57461.000 57450.000 4.440 29.687 16.621
Net_Income 2 40257 17287488.831 118032673.517 2027491.000 7032593.371 7729250.441 -1214338654.0 3309549602.000 4523888256.000 11.633 265.467 588276.555
Profit_Margin 3 40257 -0.027 1.249 -0.017 -0.019 0.126 -15.5 62.071 77.525 38.697 1828.754 0.006
Cash_on_Hand 4 40257 20301583.348 120637587.895 1949345.000 5733344.451 2927095.697 -2512769971.0 3880477236.000 6393247207.000 10.203 362.054 601259.486
Equity 5 40257 175251339.063 627656053.449 34648343.000 76993362.623 52488269.858 -3248783235.0 10242977000.000 13491760235.000 11.324 165.703 3128246.865
Discharges 6 40257 7014.259 9908.036 2811.000 4973.915 3742.082 1.0 129339.000 129338.000 3.028 16.372 49.382
ER_Visits 7 40257 32864.497 33976.188 25085.000 27312.390 24528.134 0.0 543457.000 543457.000 3.023 21.125 169.338
Surgeries 8 40257 6349.317 7987.273 4464.000 4861.438 4345.501 0.0 130741.000 130741.000 4.529 40.143 39.809
Staffed_Beds 9 40257 146.507 172.468 86.000 112.708 90.439 2.0 2753.000 2751.000 3.034 19.357 0.860
Affiliated_Physicians 10 40257 231.786 353.461 104.000 156.015 130.469 1.0 4328.000 4327.000 3.838 23.474 1.762
Employees 11 40257 1008.034 1683.991 436.000 650.519 481.845 4.0 26491.000 26487.000 5.191 44.102 8.393
Medicare 12 40257 0.448 0.186 0.422 0.439 0.181 0.0 0.983 0.983 0.414 -0.285 0.001
Medicaid 13 40257 0.087 0.091 0.063 0.071 0.056 0.0 0.869 0.869 2.598 10.541 0.000
Ownership_* 14 40257 2.378 0.793 3.000 2.472 0.000 1.0 3.000 2.000 -0.780 -0.977 0.004
Medical_School_Affiliation_* 15 40257 3.544 0.842 4.000 3.710 0.000 1.0 5.000 4.000 -1.539 1.105 0.004
Hospital_Type_* 16 40257 5.590 2.163 7.000 5.863 0.000 1.0 7.000 6.000 -0.984 -0.947 0.011
State_* 17 40257 25.839 14.612 25.000 25.882 19.274 1.0 51.000 50.000 -0.008 -1.302 0.073
Urban_Rural_* 18 40257 1.544 0.498 2.000 1.555 0.000 1.0 2.000 1.000 -0.175 -1.969 0.002
Year_* 19 40257 1.994 0.816 2.000 1.992 1.483 1.0 3.000 2.000 0.011 -1.499 0.004
DRG_* 20 40257 1.971 0.809 2.000 1.964 1.483 1.0 3.000 2.000 0.052 -1.470 0.004
#############################################################################

Graphs

Boxplots for Affilation Data

m=10000
myp1=ggplot(mycsv[mycsv$Year_=="Y16",], aes(x=reorder(Medical_School_Affiliation_,Diagnoses, median), y=Diagnoses, color=Medical_School_Affiliation_))+
  geom_boxplot(notch=TRUE)+
  theme(legend.position = "none")+
  xlab("")+
  coord_flip()+
  scale_y_continuous(name="Year 2016", labels = comma, limits=c(0,m))

myp2=ggplot(mycsv[mycsv$Year_=="Y17",], aes(x=reorder(Medical_School_Affiliation_,Diagnoses, median), y=Diagnoses, color=Medical_School_Affiliation_))+
  geom_boxplot(notch=TRUE)+
  theme(legend.position = "none")+
  xlab("")+
  coord_flip()+
  scale_y_continuous(name="Year 2017", labels = comma, limits=c(0,m))

myp3=ggplot(mycsv[mycsv$Year_=="Y18",], aes(x=reorder(Medical_School_Affiliation_,Diagnoses, median), y=Diagnoses, color=Medical_School_Affiliation_))+
  geom_boxplot(notch=TRUE)+
  theme(legend.position = "none")+
  xlab("")+
  coord_flip()+
  scale_y_continuous(name="Year 2018", labels = comma, limits=c(0,m))

grid.arrange(myp1,myp2,myp3)
## Warning: Removed 259 rows containing non-finite values (stat_boxplot).
## Warning: Removed 518 rows containing non-finite values (stat_boxplot).
## notch went outside hinges. Try setting notch=FALSE.
## Warning: Removed 566 rows containing non-finite values (stat_boxplot).
## notch went outside hinges. Try setting notch=FALSE.

Boxplots for Type Data

m=10000
myp1=ggplot(mycsv[mycsv$Year_=="Y16",], aes(x=reorder(Ownership_,Diagnoses, median), y=Diagnoses, color=Ownership_))+
  geom_boxplot(notch=TRUE)+
  theme(legend.position = "none")+
  xlab("")+
  coord_flip()+
  scale_y_continuous(name="Year 2016", labels = comma, limits=c(0,m))

myp2=ggplot(mycsv[mycsv$Year_=="Y17",], aes(x=reorder(Ownership_,Diagnoses, median), y=Diagnoses, color=Ownership_))+
  geom_boxplot(notch=TRUE)+
  theme(legend.position = "none")+
  xlab("")+
  coord_flip()+
  scale_y_continuous(name="Year 2017", labels = comma, limits=c(0,m))

myp3=ggplot(mycsv[mycsv$Year_=="Y18",], aes(x=reorder(Ownership_,Diagnoses, median), y=Diagnoses, color=Ownership_))+
  geom_boxplot(notch=TRUE)+
  theme(legend.position = "none")+
  xlab("")+
  coord_flip()+
  scale_y_continuous(name="Year 2018", labels = comma, limits=c(0,m))

grid.arrange(myp1,myp2,myp3)
## Warning: Removed 259 rows containing non-finite values (stat_boxplot).
## Warning: Removed 518 rows containing non-finite values (stat_boxplot).
## Warning: Removed 566 rows containing non-finite values (stat_boxplot).

Barplots

#############################################################################
mynames=names(mycsv) #get the names for the data
for (i in 14:20){
  p = ggplot(data = mycsv, aes(x = mycsv[,i], fill=Year_))+   geom_bar(stat='count')+facet_grid(~Year_, scale='free_x')+ ggtitle(mynames[i])+ xlab('')+ylab('Count')+ coord_flip() + theme(axis.text.x = element_text(angle = 90, hjust = 1))
  print(p)
  barplot(sort(table(mycsv[, i]), decreasing = FALSE), main=mynames[i], col="red", horiz=TRUE, las=2, cex.names=.5)
} 

#############################################################################

Correlations Pre-Transform

We investigate hierarchically clustered correlation plots

#############################################################################
corfunction(mycsv[, 1:13])

#############################################################################

Generate Principal Components

#############################################################################
totalpc=prcomp(mycsv[, 6:11], center=TRUE, scale=TRUE, retx=TRUE)
trainpc=prcomp(train[, 6:11], center=TRUE, scale=TRUE, retx=TRUE)
checkvar=data.frame('Total PC'=totalpc$sdev^2/sum(totalpc$sdev^2), 'TrainPC'= trainpc$sdev^2/sum(trainpc$sdev^2))
row.names(checkvar)=c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6")
colnames(checkvar)=c("%Var, Total Data", "%Var, Training Dat")
myprint(checkvar,colnames(checkvar))
%Var, Total Data %Var, Training Dat
PC1 0.848 0.849
PC2 0.065 0.065
PC3 0.034 0.033
PC4 0.028 0.028
PC5 0.016 0.016
PC6 0.010 0.010
mycsv$PC1=totalpc$x[,1]; train$PC1=trainpc$x[,1]
test$PC1=predict(trainpc, newdata=test[,6:11])[,1] #Predict Test with Training
mycsv[,6:11]=train[,6:11]=test[,6:11]=NULL
#############################################################################

Re-order & Re-check Correlations

#############################################################################
corfunction(mycsv[, 1:7,15])

mycsv=mycsv[,c(1,15,2:14)] 
train=train[,c(1,15,2:14)]
test=test[,c(1,15,2:14)]
#############################################################################

Scale for Analysis

#############################################################################
for (i in c(1:8)){mycsv[,i]=myscale(mycsv[,i])
train[,i]=myscale(train[,i]) #update training set
test[,i]=myscale(test[,i])} #update test set
#############################################################################
describe(mycsv[, 1:8])%>%kbl()%>%kable_classic(full_width = F, html_font = "Cambria")
vars n mean sd median trimmed mad min max range skew kurtosis se
Diagnoses 1 40257 0.028 0.058 0.007 0.014 0.009 0 1 1 4.440 29.687 0.000
PC1 2 40257 0.941 0.074 0.968 0.956 0.034 0 1 1 -3.269 19.291 0.000
Net_Income 3 40257 0.272 0.026 0.269 0.270 0.002 0 1 1 11.633 265.467 0.000
Profit_Margin 4 40257 0.199 0.016 0.199 0.199 0.002 0 1 1 38.697 1828.754 0.000
Cash_on_Hand 5 40257 0.396 0.019 0.393 0.394 0.000 0 1 1 10.203 362.054 0.000
Equity 6 40257 0.254 0.047 0.243 0.247 0.004 0 1 1 11.324 165.703 0.000
Medicare 7 40257 0.456 0.189 0.429 0.447 0.184 0 1 1 0.414 -0.285 0.001
Medicaid 8 40257 0.100 0.104 0.072 0.081 0.065 0 1 1 2.598 10.541 0.001

Python Libraries


#############################################################################
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import gc
import sklearn.linear_model   #Need the linear model components
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor as RFR # Random Forest package
from sklearn.ensemble import ExtraTreesRegressor as ETR # Extra Trees package
from sklearn.ensemble import BaggingRegressor as BR 
from sklearn.model_selection import train_test_split as tts
#from sklearn.model_selection import RandomizedSearchCV as RSCV
from xgboost import XGBRegressor as XGB
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet 
from sklearn.linear_model import LinearRegression as OLS
from sklearn.metrics import mean_squared_error, r2_score # evaluation metrics
#############################################################################
print("Loaded 20 or so Python libraries...")
## Loaded 20 or so Python libraries...

Read Data for Python


#############################################################################
mydata, train, test = r.mycsv, r.train,r.test
mydata, train, test=pd.get_dummies(mydata), pd.get_dummies(train), pd.get_dummies(test)
mynames=mydata.columns
xtrain, ytrain, xtest, ytest =train.iloc[:, 1:len(train.columns)], train.iloc[:,0], test.iloc[:, 1:len(test.columns)], test.iloc[:,0]
xtot, ytot=mydata.iloc[:, 1:len(mydata.columns)], mydata.iloc[:,]
#############################################################################
print("Read in R full, train, test data...")
## Read in R full, train, test data...

Initial ML Models


t1=OLS(n_jobs=-1)
t2=RFR(n_estimators = 50, criterion='mse',n_jobs = -1, random_state = 1234)
t3=ETR(n_estimators = 50,criterion='mse', n_jobs = -1, random_state = 1234)
t4=XGB(n_estimators=50, n_jobs=-1,learning_rate=0.1, max_depth=15,objective='reg:squarederror', random_state=1234) 
t5=BR(n_estimators=50, random_state=1234)

def myf(x,name):
    temp=x.fit(xtrain,ytrain)
    print(name,": ", round(temp.score(xtest, ytest),4))
    
myf(t1, 'OLS')
## OLS :  0.4372
myf(t2, 'RFR')
## RFR :  0.829
myf(t3, 'ETR')
## ETR :  0.8658
myf(t4, 'XGB')
## XGB :  0.8155
## 
## C:\Users\tf\lib\site-packages\xgboost\data.py:114: UserWarning: Use subset (sliced data) of np.ndarray is not recommended because it will generate extra copies and increase memory consumption
##   "because it will generate extra copies and increase " +
myf(t5, 'BR')

#############################################################################
## BR :  0.8285

Prediction with Best Model Post-Tuning


#############################################################################
model=ETR(n_estimators = 200, max_depth=70,n_jobs=-1, random_state=1234)  
modelfit=model.fit(xtrain, ytrain)
mypred=model.predict(xtest)
mynames=mydata.columns
print(modelfit.score(xtest,ytest))
## 0.8644536708445771
tempdata={'Variable':mynames[1:len(mynames)],'Importance':modelfit.feature_importances_}
#############################################################################

Descending in Order for Importance


#############################################################################
a=pd.DataFrame(modelfit.feature_importances_)
a.index, a.columns=mynames[1:len(mynames)],["Importance"]
a.sort_values(by=['Importance'], ascending=False,inplace=True)
#############################################################################
print("Sort variables in order of importance...")
## Sort variables in order of importance...

Barplot of Importances

#############################################################################
imp=reticulate::py$a$Importance
impnames=factor(c("STAC", "PC", "DRG293", "UT", "LTAC"), levels=c("STAC", "PC", "DRG293", "UT", "LTAC"))
tempdf=data.frame(Name=impnames[1:5], Importance=imp[1:5])

ggplot(tempdf, aes(x=Name, y=Importance, fill=Name)) +   geom_bar(stat = "identity") + theme(legend.position = "none",axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),axis.title.x = element_blank())+ geom_text(aes(label=round(Importance,3)), position=position_dodge(width=0.9), vjust=-0.25)

#############################################################################

Regression Models, Hospital Unit of Analysis

Next, we build a train regression and evaluate on test set.

#############################################################################
fullOLS=lm(Diagnoses~., data=train)
mylm=summary(fullOLS); myres=residuals(fullOLS); mypred=predict(fullOLS, test)
testfit=lm(test$Diagnoses~mypred); mysum=summary(testfit)
ans=c(mylm$r.squared, mysum$r.squared);names(ans)=c("Train", "Test")
ans%>%kbl(col.names="R2")%>%kable_classic(full_width = F, html_font = "Cambria")
R2
Train 0.501
Test 0.454
#############################################################################

Regression Residual Plot

#############################################################################
layout(mat = matrix(c(1,2),2,1, byrow=TRUE),  height = c(1,8))
par(mar=c(0, 3.1, 1.1, 2.1))
boxplot(myres, horizontal=TRUE, main="", xaxt="n", col="red")
par(mar=c(4, 3.1, 1.1, 2.1))
hist(myres, main="", xlab="", col="blue")

#############################################################################

Complete Regression Model on Entire Dataset

#############################################################################
mylmfinal=lm(Diagnoses~., data=mycsv) #build full regression
mysummary=summary(mylmfinal)
ans=c(mylm$r.squared, mysum$r.squared, mysummary$r.squared); names(ans)=c("Train", "Test", "Full")
ans%>%kbl(col.names="R2")%>%kable_classic(full_width = F, html_font = "Cambria")
R2
Train 0.501
Test 0.454
Full 0.492
summary(aov(mylmfinal))
##                                Df Sum Sq Mean Sq  F value  Pr(>F)    
## PC1                             1   41.0    41.0 23898.59 < 2e-16 ***
## Net_Income                      1    0.1     0.1    41.00 1.5e-10 ***
## Profit_Margin                   1    0.0     0.0     5.65   0.017 *  
## Cash_on_Hand                    1    0.1     0.1    57.11 4.2e-14 ***
## Equity                          1    0.0     0.0    18.86 1.4e-05 ***
## Medicare                        1    0.1     0.1    47.38 5.9e-12 ***
## Medicaid                        1    0.1     0.1    73.65 < 2e-16 ***
## Ownership_                      2    0.4     0.2   120.24 < 2e-16 ***
## Medical_School_Affiliation_     4    0.4     0.1    52.12 < 2e-16 ***
## Hospital_Type_                  5    1.5     0.3   173.42 < 2e-16 ***
## State_                         50    0.9     0.0    10.75 < 2e-16 ***
## Urban_Rural_                    1    0.1     0.1    41.45 1.2e-10 ***
## Year_                           2    0.2     0.1    45.97 < 2e-16 ***
## DRG_                            2   22.0    11.0  6423.70 < 2e-16 ***
## Residuals                   40183   68.9     0.0                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#############################################################################

Function for R2

#############################################################################
myr2=function(pred, actual){
  rss =sum((pred - actual) ^ 2); tss = sum((actual - mean(actual)) ^ 2)
  rsq = 1 - rss/tss; names(rsq)="R2"; return(rsq) }
#############################################################################

Evaluate LASSO

#############################################################################
lambda_seq=10^seq(2, -2, by = -.1)

#Train Data
x=model.matrix(Diagnoses~. , train)[,-1]; y=train$Diagnoses
#Test Data
x2=model.matrix(Diagnoses~., test)[,-1]; y2=test$Diagnoses
#Total Data
x3=model.matrix(Diagnoses~., mycsv)[,-1]; y3=mycsv$Diagnoses
mynames=colnames(x3)

#Best Lambda
best_lam=cv.glmnet(x,y, alpha = 1, lambda = lambda_seq)$lambda.min
names(best_lam)="Best Lambda"

#Model and Predictions
lasso=glmnet(x,y,alpha = 1, lambda = best_lam)
pred1=predict(lasso, s = best_lam, newx=x)
pred2=predict(lasso, s = best_lam, newx=x2)
pred3=predict(lasso, s=best_lam, newx=x3)
t1=myr2(pred1,y); t2=myr2(pred2,y2); t3=myr2(pred3,y3);t4=best_lam
ans=c(t1,t2,t3,t4);names(ans)=c("Train", "Test", "Full", "Best Lambda")
ans%>%kbl(col.names="R2")%>%kable_classic(full_width = F, html_font = "Cambria")
R2
Train 0.328
Test 0.263
Full 0.323
Best Lambda 0.010
lasso=glmnet(x3,y3, alpha=1, lambda=.01)
#############################################################################

Evaluate Elasticnet

#############################################################################
myenet=cv.glmnet(x,y,alpha = .5, lambda = lambda_seq, relaxed=TRUE)
best_lam=myenet$lambda.min
names(best_lam)="Best Lambda"
enet=glmnet(x,y, alpha = .5, lambda = best_lam, gamma=best_gam)
pred3 <- predict(enet, s = best_lam, newx = x)
pred4 <- predict(enet, s = best_lam, newx = x2)
pred5 <- predict(enet, s = best_lam, newx = x3)
t1=myr2(pred3,y); t2=myr2(pred4,y2); t3=myr2(pred5,y3); t4=best_lam
ans=c(t1,t2,t3,t4);names(ans)=c("Train", "Test", "Full", "Best Lambda")
ans%>%kbl(col.names="R2")%>%kable_classic(full_width = F, html_font = "Cambria")
R2
Train 0.417
Test 0.348
Full 0.412
Best Lambda 0.010
enet=glmnet(x3,y3, alpha = .5, lambda = best_lam)
#############################################################################

Combine Coefficient Estimates

#############################################################################
LM=mysummary$coefficients[,1]; LM=c(LM[2:15],0, LM[16:74])
PValueLM=mysummary$coefficients[,4]
LASSO=as.vector(lasso$beta); ENET=as.vector(enet$beta)
TOT=cbind(round(LM,3), round(LASSO,3), round(ENET, 3), round(PValueLM, 3))
mydf1=as.data.frame(TOT)
rownames(mydf1)=mynames
colnames(mydf1)=c("OLS for p<.05","LASSO", "ENET", "P-Value OLS")
mydf1%>% kbl()%>%kable_classic(full_width = F, html_font = "Cambria")
OLS for p<.05 LASSO ENET P-Value OLS
PC1 -0.439 -0.298 -0.323 0.000
Net_Income -0.043 0.000 0.000 0.000
Profit_Margin 0.026 0.000 0.000 0.000
Cash_on_Hand -0.082 0.000 0.000 0.048
Equity 0.012 0.000 0.000 0.000
Medicare 0.029 0.000 0.000 0.025
Medicaid -0.002 0.000 0.000 0.000
Ownership_Proprietary 0.003 0.000 0.000 0.419
Ownership_Voluntary Nonprofit 0.006 0.000 0.000 0.000
Medical_School_Affiliation_Limited 0.002 0.000 0.000 0.000
Medical_School_Affiliation_Major -0.009 0.000 0.000 0.066
Medical_School_Affiliation_No Affiliation 0.000 0.000 0.000 0.000
Medical_School_Affiliation_Unknown -0.005 0.000 0.000 0.708
Hospital_Type_Critical Access Hospital 0.074 0.000 0.000 0.124
Hospital_Type_Department of Defense Hospital 0.000 0.000 0.000 0.000
Hospital_Type_Long Term Acute Care Hospital 0.048 0.000 0.000 0.000
Hospital_Type_Psychiatric Hospital 0.067 0.000 0.000 0.000
Hospital_Type_Rehabilitation Hospital 0.058 0.000 0.000 0.000
Hospital_Type_Short Term Acute Care Hospital 0.084 0.000 0.006 0.000
State_AL 0.005 0.000 0.000 0.178
State_AR 0.003 0.000 0.000 0.479
State_AZ -0.002 0.000 0.000 0.659
State_CA 0.000 0.000 0.000 0.985
State_CO -0.003 0.000 0.000 0.457
State_CT 0.013 0.000 0.000 0.002
State_DC 0.006 0.000 0.000 0.273
State_DE 0.018 0.000 0.000 0.004
State_FL 0.006 0.000 0.000 0.081
State_GA 0.009 0.000 0.000 0.014
State_HI -0.003 0.000 0.000 0.531
State_IA 0.001 0.000 0.000 0.734
State_ID 0.000 0.000 0.000 0.983
State_IL 0.009 0.000 0.000 0.011
State_IN 0.006 0.000 0.000 0.081
State_KS 0.002 0.000 0.000 0.499
State_KY 0.005 0.000 0.000 0.153
State_LA 0.006 0.000 0.000 0.097
State_MA 0.010 0.000 0.000 0.007
State_MD 0.013 0.000 0.000 0.001
State_ME -0.002 0.000 0.000 0.605
State_MI 0.013 0.000 0.000 0.000
State_MN 0.003 0.000 0.000 0.432
State_MO 0.005 0.000 0.000 0.131
State_MS 0.003 0.000 0.000 0.466
State_MT 0.004 0.000 0.000 0.369
State_NC 0.016 0.000 0.000 0.000
State_ND 0.004 0.000 0.000 0.390
State_NE 0.001 0.000 0.000 0.859
State_NH 0.002 0.000 0.000 0.643
State_NJ 0.012 0.000 0.000 0.001
State_NM -0.002 0.000 0.000 0.634
State_NV 0.007 0.000 0.000 0.105
State_NY -0.005 0.000 0.000 0.136
State_OH 0.007 0.000 0.000 0.038
State_OK 0.001 0.000 0.000 0.808
State_OR -0.002 0.000 0.000 0.691
State_PA 0.002 0.000 0.000 0.588
State_RI 0.002 0.000 0.000 0.741
State_SC 0.006 0.000 0.000 0.117
State_SD -0.002 0.000 0.000 0.632
State_TN 0.003 0.000 0.000 0.442
State_TX 0.004 0.000 0.000 0.231
State_UT -0.004 0.000 0.000 0.266
State_VA 0.014 0.000 0.000 0.000
State_VT -0.001 0.000 0.000 0.891
State_WA 0.005 0.000 0.000 0.153
State_WI 0.003 0.000 0.000 0.472
State_WV 0.002 0.000 0.000 0.579
State_WY 0.002 0.000 0.000 0.692
Urban_Rural_Urban 0.004 0.000 0.000 0.000
Year_Y17 0.003 0.000 0.000 0.000
Year_Y18 0.004 0.000 0.000 0.000
DRG_DRG292 -0.040 0.000 -0.016 0.000
DRG_DRG293 -0.056 -0.013 -0.029 0.000
#############################################################################