Introduction

This analysis attempts to predict the number of diagnoses of heart failure. The dependent variable is the number of diagnoses. The unit of analysis is the hospital level by year. Independent variable groups include financial, workload, technical, geospatial, and temporal variables.

Setup

Load the printing functions and all libraries

#############################################################################
knitr::opts_chunk$set(echo = TRUE)    
library(lemon)
## Warning: package 'lemon' was built under R version 3.5.3
library(knitr)
knit_print.data.frame <- lemon_print
library(psych)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.3
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
library(scales)
## 
## Attaching package: 'scales'
## The following objects are masked from 'package:psych':
## 
##     alpha, rescale
library(gridExtra)
library(Amelia)
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 3.5.3
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.5, built: 2018-05-07)
## ## Copyright (C) 2005-2019 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(mice)
## Loading required package: lattice
## 
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(MASS)
library(leaps)
## Warning: package 'leaps' was built under R version 3.5.2
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
library(ggplot2)
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 3.5.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages -------------------------------------------------------- tidyverse 1.2.1 --
## v tibble  2.1.3     v purrr   0.2.5
## v tidyr   0.8.2     v dplyr   0.8.3
## v readr   1.1.1     v stringr 1.3.1
## v tibble  2.1.3     v forcats 0.3.0
## Warning: package 'tibble' was built under R version 3.5.3
## Warning: package 'dplyr' was built under R version 3.5.3
## -- Conflicts ----------------------------------------------------------- tidyverse_conflicts() --
## x purrr::%||%()       masks lemon::%||%()
## x ggplot2::%+%()      masks psych::%+%()
## x scales::alpha()     masks ggplot2::alpha(), psych::alpha()
## x readr::col_factor() masks scales::col_factor()
## x dplyr::combine()    masks gridExtra::combine()
## x tidyr::complete()   masks mice::complete()
## x purrr::discard()    masks scales::discard()
## x dplyr::filter()     masks stats::filter()
## x dplyr::lag()        masks stats::lag()
## x dplyr::recode()     masks car::recode()
## x dplyr::select()     masks MASS::select()
## x purrr::some()       masks car::some()
library(caret)
## Warning: package 'caret' was built under R version 3.5.2
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(glmnet)
## Warning: package 'glmnet' was built under R version 3.5.3
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loaded glmnet 2.0-18
library(fpp2)
## Warning: package 'fpp2' was built under R version 3.5.3
## Loading required package: forecast
## Loading required package: fma
## 
## Attaching package: 'fma'
## The following objects are masked from 'package:MASS':
## 
##     cement, housing, petrol
## Loading required package: expsmooth
library(hts)
## Warning: package 'hts' was built under R version 3.5.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.5.3
## corrplot 0.84 loaded
library(clusterGeneration)
## Warning: package 'clusterGeneration' was built under R version 3.5.3
library(xgboost)
## Warning: package 'xgboost' was built under R version 3.5.3
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
library(ResourceSelection)
## Warning: package 'ResourceSelection' was built under R version 3.5.3
## ResourceSelection 0.3-5   2019-07-22
library(keras)
library(NbClust)
## Warning: package 'NbClust' was built under R version 3.5.2
library(factoextra)
## Warning: package 'factoextra' was built under R version 3.5.3
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(scanstatistics)
## Warning: package 'scanstatistics' was built under R version 3.5.3
library(lattice)
library(latticeExtra)
## Warning: package 'latticeExtra' was built under R version 3.5.2
## Loading required package: RColorBrewer
## 
## Attaching package: 'latticeExtra'
## The following object is masked from 'package:ggplot2':
## 
##     layer
library(QuantPsyc)
## Warning: package 'QuantPsyc' was built under R version 3.5.3
## Loading required package: boot
## 
## Attaching package: 'boot'
## The following object is masked from 'package:car':
## 
##     logit
## The following object is masked from 'package:lattice':
## 
##     melanoma
## The following object is masked from 'package:psych':
## 
##     logit
## 
## Attaching package: 'QuantPsyc'
## The following object is masked from 'package:Matrix':
## 
##     norm
## The following object is masked from 'package:base':
## 
##     norm
#############################################################################

Read the Data, Investigate Missing Data

Read, investigate missing.

#############################################################################
mycsv=read.csv("D:/heart/thf2.csv")
mycsv$Acute_Days=as.numeric(mycsv$Acute_Days) #convert Acute Days to numeric
mycsv$Operating_Profit_Margin=as.numeric(mycsv$Operating_Profit_Margin)
missmap(mycsv, y.labels="", y.at=1, legend=TRUE) #missing map

#NOTE:  14 blanks regarding Ownership were filled with the mode.
#############################################################################

Train and Test Set

We could select any reasonable percentage here. For our purposes, we use 20% as a test set. We train using 12,783 observations. We withhold 3,196 for validation.

#############################################################################
set.seed(1234) #set a pseudo-random number
mys=sort(sample(1:nrow(mycsv),.2*nrow(mycsv), replace=FALSE))#sample 20%
train=mycsv[-mys,] #training set omits that 20%
test=mycsv[mys,] #test set is that 20%
#############################################################################

Impute the Missing

There are only 3% missing data, so mean imputation makes sense (conservative). Some argue that we are “spilling” by filling the missing with the whole dataset rather than calculating the means on a training set and using this for the test set. The proportion of missing is so small that this leakage not a significant problem. Still, we use the training set means to impute the test set missing.

#############################################################################
for(i in 2:14){ #replace 1% missing with means
  mysubs=mean(na.omit(train[,i]))
  train[is.na(train[,i]), i] <- mysubs #training set
  test[is.na(test[,i]), i] <- mysubs #test set
  mycsv[is.na(mycsv[,i]), i] <- mean(mycsv[,i], na.rm = TRUE) #all for desc.
  }
missmap(train, y.labels="", y.at=1, legend=TRUE)  #check missing train

missmap(test, y.labels="", y.at=1, legend = TRUE) #check missing test

missmap(mycsv,y.labels="", y.at=1, legend=TRUE) #check missing overall

#NOTE:  we used the mode for the missing values for ownership.
#############################################################################

Describe the Data

We provide descriptive statistics by year and overall using the total dataset.

#############################################################################
mydescribe=function(x){  #Function to Describe by Years
  mysub=mycsv[mycsv$Year_==x,]  #subset by year
  describe(mysub) #describe
}
mydescribe("Y2016") #Run Y2016
##                              vars    n         mean           sd
## Number_Diagnoses                1 5353      1024.26      1540.40
## Staffed_Beds                    2 5353       134.56       161.91
## Discharges                      3 5353      6163.25      9317.04
## ER_Visits                       4 5353     30714.65     31939.40
## Surgeries                       5 5353      5992.91      7505.83
## Acute_Days                      6 5353     31131.38     48359.51
## Net_Income                      7 5353  10526316.74  92603127.53
## Operating_Profit_Margin         8 5353        -0.03         1.12
## Cash.on.Hand                    9 5353  17370333.76 109340504.40
## Equity                         10 5353 144162296.54 498946461.48
## Affiliated_Physicians          11 5353       204.57       334.03
## Employees                      12 5353       886.32      1574.61
## Medicare                       13 5353         0.45         0.20
## Medicaid                       14 5353         0.09         0.10
## Year_*                         15 5353         1.00         0.00
## State_*                        16 5353        28.64        15.82
## Urban_Rural_*                  17 5353         1.58         0.49
## Ownership_*                    18 5353         2.33         0.77
## Medical_School_Affiliation_*   19 5353         3.58         0.83
## Type_*                         20 5353         5.46         2.11
##                                   median     trimmed         mad
## Number_Diagnoses                  339.00      685.27      440.33
## Staffed_Beds                       76.00      101.97       75.61
## Discharges                       2095.00     4160.70     2770.98
## ER_Visits                       26470.00    25441.89    24477.73
## Surgeries                        4767.00     4612.71     4289.16
## Acute_Days                      12587.00    20485.76    15493.17
## Net_Income                    1555170.00  5479701.56  6508301.17
## Operating_Profit_Margin            -0.02       -0.02        0.13
## Cash.on.Hand                  1727685.00  5101209.51  2639432.75
## Equity                       24749610.00 60671402.78 39079656.21
## Affiliated_Physicians              85.00      132.01      112.68
## Employees                         345.00      548.69      378.06
## Medicare                            0.43        0.45        0.20
## Medicaid                            0.07        0.07        0.06
## Year_*                              1.00        1.00        0.00
## State_*                            27.00       28.77       22.24
## Urban_Rural_*                       2.00        1.60        0.00
## Ownership_*                         3.00        2.42        0.00
## Medical_School_Affiliation_*        4.00        3.75        0.00
## Type_*                              7.00        5.70        0.00
##                                        min          max        range  skew
## Number_Diagnoses              1.100000e+01 1.653300e+04 1.652200e+04  2.87
## Staffed_Beds                  1.000000e+00 2.753000e+03 2.752000e+03  3.25
## Discharges                    1.000000e+00 1.293390e+05 1.293380e+05  3.24
## ER_Visits                     0.000000e+00 5.434570e+05 5.434570e+05  3.19
## Surgeries                     0.000000e+00 1.307410e+05 1.307410e+05  4.80
## Acute_Days                    1.400000e+01 6.614220e+05 6.614080e+05  3.69
## Net_Income                   -1.214339e+09 2.734304e+09 3.948643e+09  7.08
## Operating_Profit_Margin      -1.505000e+01 5.772000e+01 7.277000e+01 39.22
## Cash.on.Hand                 -2.512770e+09 3.880477e+09 6.393247e+09 11.24
## Equity                       -3.248783e+09 1.024298e+10 1.349176e+10 11.09
## Affiliated_Physicians         1.000000e+00 4.266000e+03 4.265000e+03  4.10
## Employees                     4.000000e+00 2.467300e+04 2.466900e+04  5.43
## Medicare                      0.000000e+00 9.800000e-01 9.800000e-01  0.31
## Medicaid                      0.000000e+00 9.600000e-01 9.600000e-01  2.86
## Year_*                        1.000000e+00 1.000000e+00 0.000000e+00   NaN
## State_*                       1.000000e+00 5.600000e+01 5.500000e+01 -0.04
## Urban_Rural_*                 1.000000e+00 2.000000e+00 1.000000e+00 -0.33
## Ownership_*                   1.000000e+00 3.000000e+00 2.000000e+00 -0.65
## Medical_School_Affiliation_*  1.000000e+00 5.000000e+00 4.000000e+00 -1.60
## Type_*                        1.000000e+00 7.000000e+00 6.000000e+00 -0.84
##                              kurtosis         se
## Number_Diagnoses                11.87      21.05
## Staffed_Beds                    21.92       2.21
## Discharges                      18.20     127.34
## ER_Visits                       23.60     436.54
## Surgeries                       45.16     102.59
## Acute_Days                      22.47     660.97
## Net_Income                     242.02 1265689.08
## Operating_Profit_Margin       1921.70       0.02
## Cash.on.Hand                   448.15 1494453.65
## Equity                         182.35 6819543.83
## Affiliated_Physicians           26.39       4.57
## Employees                       47.38      21.52
## Medicare                        -0.44       0.00
## Medicaid                        12.57       0.00
## Year_*                            NaN       0.00
## State_*                         -1.28       0.22
## Urban_Rural_*                   -1.89       0.01
## Ownership_*                     -1.04       0.01
## Medical_School_Affiliation_*     1.43       0.01
## Type_*                          -1.09       0.03
mydescribe("Y2017") #Run Y2017
##                              vars    n         mean           sd
## Number_Diagnoses                1 5328      1074.81      1628.65
## Staffed_Beds                    2 5328       134.39       162.43
## Discharges                      3 5328      6188.25      9333.42
## ER_Visits                       4 5328     30741.98     32094.02
## Surgeries                       5 5328      6010.48      7536.27
## Acute_Days                      6 5328     31255.05     48415.90
## Net_Income                      7 5328  10682310.72  92765354.87
## Operating_Profit_Margin         8 5328        -0.03         1.11
## Cash.on.Hand                    9 5328  17455050.60 109588184.49
## Equity                         10 5328 145331834.40 499421382.36
## Affiliated_Physicians          11 5328       204.76       334.32
## Employees                      12 5328       891.76      1585.19
## Medicare                       13 5328         0.45         0.20
## Medicaid                       14 5328         0.09         0.10
## Year_*                         15 5328         2.00         0.00
## State_*                        16 5328        28.61        15.84
## Urban_Rural_*                  17 5328         1.58         0.49
## Ownership_*                    18 5328         2.34         0.77
## Medical_School_Affiliation_*   19 5328         3.58         0.83
## Type_*                         20 5328         5.47         2.11
##                                   median     trimmed         mad
## Number_Diagnoses                  348.00      714.50      450.71
## Staffed_Beds                       74.00      101.60       72.65
## Discharges                       2095.50     4186.93     2764.31
## ER_Visits                       26302.00    25442.00    24457.71
## Surgeries                        4763.00     4623.06     4314.37
## Acute_Days                      12653.50    20622.97    15559.89
## Net_Income                    1567158.00  5585925.05  6564952.80
## Operating_Profit_Margin            -0.02       -0.02        0.13
## Cash.on.Hand                  1727641.50  5088121.76  2635056.11
## Equity                       25271947.00 61515156.64 39728710.32
## Affiliated_Physicians              84.00      132.11      111.19
## Employees                         345.00      551.75      376.58
## Medicare                            0.43        0.45        0.20
## Medicaid                            0.07        0.07        0.06
## Year_*                              2.00        2.00        0.00
## State_*                            27.00       28.73       22.24
## Urban_Rural_*                       2.00        1.60        0.00
## Ownership_*                         3.00        2.42        0.00
## Medical_School_Affiliation_*        4.00        3.74        0.00
## Type_*                              7.00        5.71        0.00
##                                        min          max        range  skew
## Number_Diagnoses              1.100000e+01 1.703100e+04 1.702000e+04  2.86
## Staffed_Beds                  1.000000e+00 2.753000e+03 2.752000e+03  3.23
## Discharges                    1.000000e+00 1.293390e+05 1.293380e+05  3.23
## ER_Visits                     0.000000e+00 5.434570e+05 5.434570e+05  3.20
## Surgeries                     0.000000e+00 1.307410e+05 1.307410e+05  4.78
## Acute_Days                    1.400000e+01 6.614220e+05 6.614080e+05  3.68
## Net_Income                   -1.214339e+09 2.734304e+09 3.948643e+09  7.06
## Operating_Profit_Margin      -1.505000e+01 5.772000e+01 7.277000e+01 40.77
## Cash.on.Hand                 -2.512770e+09 3.880477e+09 6.393247e+09 11.22
## Equity                       -3.248783e+09 1.024298e+10 1.349176e+10 11.06
## Affiliated_Physicians         1.000000e+00 4.266000e+03 4.265000e+03  4.09
## Employees                     4.000000e+00 2.467300e+04 2.466900e+04  5.41
## Medicare                      0.000000e+00 9.800000e-01 9.800000e-01  0.31
## Medicaid                      0.000000e+00 9.600000e-01 9.600000e-01  2.81
## Year_*                        2.000000e+00 2.000000e+00 0.000000e+00   NaN
## State_*                       1.000000e+00 5.600000e+01 5.500000e+01 -0.03
## Urban_Rural_*                 1.000000e+00 2.000000e+00 1.000000e+00 -0.33
## Ownership_*                   1.000000e+00 3.000000e+00 2.000000e+00 -0.66
## Medical_School_Affiliation_*  1.000000e+00 5.000000e+00 4.000000e+00 -1.61
## Type_*                        1.000000e+00 7.000000e+00 6.000000e+00 -0.85
##                              kurtosis         se
## Number_Diagnoses                11.65      22.31
## Staffed_Beds                    21.72       2.23
## Discharges                      18.11     127.87
## ER_Visits                       23.50     439.69
## Surgeries                       44.62     103.25
## Acute_Days                      22.40     663.29
## Net_Income                     241.38 1270877.53
## Operating_Profit_Margin       2003.92       0.02
## Cash.on.Hand                   446.13 1501348.88
## Equity                         181.96 6842030.79
## Affiliated_Physicians           26.30       4.58
## Employees                       46.77      21.72
## Medicare                        -0.44       0.00
## Medicaid                        12.10       0.00
## Year_*                            NaN       0.00
## State_*                         -1.28       0.22
## Urban_Rural_*                   -1.89       0.01
## Ownership_*                     -1.03       0.01
## Medical_School_Affiliation_*     1.41       0.01
## Type_*                          -1.08       0.03
mydescribe("Y2018") #Run Y2018
##                              vars    n         mean           sd
## Number_Diagnoses                1 5298      1099.37      1703.45
## Staffed_Beds                    2 5298       134.16       163.11
## Discharges                      3 5298      6212.14      9352.82
## ER_Visits                       4 5298     30762.97     32181.34
## Surgeries                       5 5298      6024.85      7541.14
## Acute_Days                      6 5298     31421.26     48575.61
## Net_Income                      7 5298  10680677.12  93002739.22
## Operating_Profit_Margin         8 5298        -0.04         0.78
## Cash.on.Hand                    9 5298  17536289.90 109859031.30
## Equity                         10 5298 146188576.77 500266829.97
## Affiliated_Physicians          11 5298       205.45       335.47
## Employees                      12 5298       895.54      1585.96
## Medicare                       13 5298         0.45         0.20
## Medicaid                       14 5298         0.09         0.10
## Year_*                         15 5298         3.00         0.00
## State_*                        16 5298        28.61        15.82
## Urban_Rural_*                  17 5298         1.58         0.49
## Ownership_*                    18 5298         2.33         0.78
## Medical_School_Affiliation_*   19 5298         3.57         0.83
## Type_*                         20 5298         5.47         2.11
##                                   median     trimmed         mad
## Number_Diagnoses                  347.00      719.66      452.19
## Staffed_Beds                       73.00      101.12       71.16
## Discharges                       2112.00     4208.92     2784.32
## ER_Visits                       26249.50    25438.41    24503.67
## Surgeries                        4806.50     4638.69     4320.30
## Acute_Days                      12719.00    20755.74    15668.86
## Net_Income                    1637961.00  5658277.62  6588999.83
## Operating_Profit_Margin            -0.02       -0.02        0.13
## Cash.on.Hand                  1804929.00  5147832.72  2740340.73
## Equity                       25669602.50 62036278.68 39881103.81
## Affiliated_Physicians              84.00      132.40      111.19
## Employees                         347.50      555.52      380.29
## Medicare                            0.43        0.44        0.20
## Medicaid                            0.07        0.07        0.06
## Year_*                              3.00        3.00        0.00
## State_*                            27.00       28.72       22.24
## Urban_Rural_*                       2.00        1.60        0.00
## Ownership_*                         3.00        2.42        0.00
## Medical_School_Affiliation_*        4.00        3.74        0.00
## Type_*                              7.00        5.71        0.00
##                                        min          max        range  skew
## Number_Diagnoses              1.100000e+01 1.785100e+04 1.784000e+04  3.04
## Staffed_Beds                  1.000000e+00 2.753000e+03 2.752000e+03  3.22
## Discharges                    1.000000e+00 1.293390e+05 1.293380e+05  3.23
## ER_Visits                     0.000000e+00 5.434570e+05 5.434570e+05  3.19
## Surgeries                     0.000000e+00 1.307410e+05 1.307410e+05  4.78
## Acute_Days                    4.800000e+01 6.614220e+05 6.613740e+05  3.66
## Net_Income                   -1.214339e+09 2.734304e+09 3.948643e+09  7.05
## Operating_Profit_Margin      -1.505000e+01 4.721000e+01 6.226000e+01 41.88
## Cash.on.Hand                 -2.512770e+09 3.880477e+09 6.393247e+09 11.19
## Equity                       -3.248783e+09 1.024298e+10 1.349176e+10 11.05
## Affiliated_Physicians         1.000000e+00 4.266000e+03 4.265000e+03  4.07
## Employees                     4.000000e+00 2.467300e+04 2.466900e+04  5.40
## Medicare                      0.000000e+00 9.800000e-01 9.800000e-01  0.32
## Medicaid                      0.000000e+00 9.600000e-01 9.600000e-01  2.82
## Year_*                        3.000000e+00 3.000000e+00 0.000000e+00   NaN
## State_*                       1.000000e+00 5.600000e+01 5.500000e+01 -0.03
## Urban_Rural_*                 1.000000e+00 2.000000e+00 1.000000e+00 -0.32
## Ownership_*                   1.000000e+00 3.000000e+00 2.000000e+00 -0.65
## Medical_School_Affiliation_*  1.000000e+00 5.000000e+00 4.000000e+00 -1.62
## Type_*                        1.000000e+00 7.000000e+00 6.000000e+00 -0.86
##                              kurtosis         se
## Number_Diagnoses                13.63      23.40
## Staffed_Beds                    21.48       2.24
## Discharges                      18.02     128.50
## ER_Visits                       23.38     442.13
## Surgeries                       44.71     103.61
## Acute_Days                      22.17     667.36
## Net_Income                     240.26 1277731.98
## Operating_Profit_Margin       2625.38       0.01
## Cash.on.Hand                   444.20 1509314.65
## Equity                         181.58 6872990.30
## Affiliated_Physicians           26.04       4.61
## Employees                       46.77      21.79
## Medicare                        -0.44       0.00
## Medicaid                        11.96       0.00
## Year_*                            NaN       0.00
## State_*                         -1.28       0.22
## Urban_Rural_*                   -1.89       0.01
## Ownership_*                     -1.05       0.01
## Medical_School_Affiliation_*     1.37       0.01
## Type_*                          -1.07       0.03
describe(mycsv) #describe data en toto
##                              vars     n         mean           sd
## Number_Diagnoses                1 15979      1066.02      1625.45
## Staffed_Beds                    2 15979       134.37       162.47
## Discharges                      3 15979      6187.80      9333.81
## ER_Visits                       4 15979     30739.79     32069.33
## Surgeries                       5 15979      6009.36      7527.24
## Acute_Days                      6 15979     31268.73     48447.16
## Net_Income                      7 15979  10629510.76  92784081.79
## Operating_Profit_Margin         8 15979        -0.03         1.02
## Cash.on.Hand                    9 15979  17453605.99 109588379.07
## Equity                         10 15979 145224098.30 499512322.58
## Affiliated_Physicians          11 15979       204.93       334.58
## Employees                      12 15979       891.19      1581.82
## Medicare                       13 15979         0.45         0.20
## Medicaid                       14 15979         0.09         0.10
## Year_*                         15 15979         2.00         0.82
## State_*                        16 15979        28.62        15.82
## Urban_Rural_*                  17 15979         1.58         0.49
## Ownership_*                    18 15979         2.33         0.77
## Medical_School_Affiliation_*   19 15979         3.58         0.83
## Type_*                         20 15979         5.47         2.11
##                                   median     trimmed         mad
## Number_Diagnoses                  344.00      706.09      446.26
## Staffed_Beds                       74.00      101.56       72.65
## Discharges                       2102.00     4184.72     2775.43
## ER_Visits                       26314.00    25439.27    24473.28
## Surgeries                        4777.00     4624.44     4317.33
## Acute_Days                      12655.00    20617.36    15573.23
## Net_Income                    1585375.00  5572344.00  6558153.60
## Operating_Profit_Margin            -0.02       -0.02        0.13
## Cash.on.Hand                  1758316.00  5110766.94  2676927.70
## Equity                       25182432.00 61386516.91 39543320.09
## Affiliated_Physicians              84.00      132.15      111.19
## Employees                         346.00      551.89      378.06
## Medicare                            0.43        0.45        0.20
## Medicaid                            0.07        0.07        0.06
## Year_*                              2.00        2.00        1.48
## State_*                            27.00       28.74       22.24
## Urban_Rural_*                       2.00        1.60        0.00
## Ownership_*                         3.00        2.42        0.00
## Medical_School_Affiliation_*        4.00        3.74        0.00
## Type_*                              7.00        5.71        0.00
##                                        min          max        range  skew
## Number_Diagnoses              1.100000e+01 1.785100e+04 1.784000e+04  2.94
## Staffed_Beds                  1.000000e+00 2.753000e+03 2.752000e+03  3.23
## Discharges                    1.000000e+00 1.293390e+05 1.293380e+05  3.24
## ER_Visits                     0.000000e+00 5.434570e+05 5.434570e+05  3.19
## Surgeries                     0.000000e+00 1.307410e+05 1.307410e+05  4.79
## Acute_Days                    1.400000e+01 6.614220e+05 6.614080e+05  3.68
## Net_Income                   -1.214339e+09 2.734304e+09 3.948643e+09  7.06
## Operating_Profit_Margin      -1.505000e+01 5.772000e+01 7.277000e+01 41.61
## Cash.on.Hand                 -2.512770e+09 3.880477e+09 6.393247e+09 11.22
## Equity                       -3.248783e+09 1.024298e+10 1.349176e+10 11.07
## Affiliated_Physicians         1.000000e+00 4.266000e+03 4.265000e+03  4.09
## Employees                     4.000000e+00 2.467300e+04 2.466900e+04  5.42
## Medicare                      0.000000e+00 9.800000e-01 9.800000e-01  0.31
## Medicaid                      0.000000e+00 9.600000e-01 9.600000e-01  2.83
## Year_*                        1.000000e+00 3.000000e+00 2.000000e+00  0.01
## State_*                       1.000000e+00 5.600000e+01 5.500000e+01 -0.04
## Urban_Rural_*                 1.000000e+00 2.000000e+00 1.000000e+00 -0.33
## Ownership_*                   1.000000e+00 3.000000e+00 2.000000e+00 -0.66
## Medical_School_Affiliation_*  1.000000e+00 5.000000e+00 4.000000e+00 -1.61
## Type_*                        1.000000e+00 7.000000e+00 6.000000e+00 -0.85
##                              kurtosis         se
## Number_Diagnoses                12.63      12.86
## Staffed_Beds                    21.71       1.29
## Discharges                      18.11      73.84
## ER_Visits                       23.50     253.70
## Surgeries                       44.84      59.55
## Acute_Days                      22.35     383.26
## Net_Income                     241.28  734004.42
## Operating_Profit_Margin       2205.58       0.01
## Cash.on.Hand                   446.27  866941.32
## Equity                         182.01 3951585.73
## Affiliated_Physicians           26.25       2.65
## Employees                       46.98      12.51
## Medicare                        -0.44       0.00
## Medicaid                        12.22       0.00
## Year_*                          -1.50       0.01
## State_*                         -1.28       0.13
## Urban_Rural_*                   -1.89       0.00
## Ownership_*                     -1.04       0.01
## Medical_School_Affiliation_*     1.41       0.01
## Type_*                          -1.08       0.02
#############################################################################

Quantitative Univariate Graphs

We look at histograms and boxplots for quantitative data.

#############################################################################
#time series graph
z=aggregate(Number_Diagnoses~Year_, mycsv,sum) #aggregate DV by year
myts=ts(z$Number_Diagnoses, start=c(2016), frequency=1) #produce time series
plot(myts, ylab="# Diagnoses", xlab="Year", col="red" ) #plot time series

#boxplot function and graphs
mynames=names(mycsv) #get the names for the data

for (i in 1:14){
  print(histogram(~mycsv[,i]|Year_, data=mycsv, breaks=31,xlab=mynames[i]))
  print(bwplot(~mycsv[,i]|Year_, data=mycsv, xlab=mynames[i]))
  } 

#############################################################################

Qualitative Univariate Graphs

We look at barplots for qualitative data.

#############################################################################
for (i in 15:20){
  p = ggplot(data = mycsv, aes(x = mycsv[,i], fill=Year_))
  p = p + geom_bar(stat='count')
  p = p + facet_grid(~Year_, scale='free_x')
  p = p + ggtitle(mynames[i])
  p = p + xlab('')
  p = p + ylab('Count')
  p = p + coord_flip()
  p = p + theme(axis.text.x = element_text(angle = 90, hjust = 1))
  print(p)
  barplot(sort(table(mycsv[, i]), decreasing = FALSE), main=mynames[i], col="red", horiz=TRUE, las=2, cex.names=.5)
} 

#############################################################################

Correlations Pre-Transform

We investigate hierarchically clustered correlation plots

#############################################################################
library(corrplot)
sub=cor(mycsv[,1:14])
corrplot(sub, type="upper",method="ellipse",order="hclust") #hierarchically clustered

#############################################################################

Make Positive Definite

With the obvious non-normality, we seek a solution that cannot reject multivariate non-normality using multivariate Box-Cox transformations. To do so, requires all observations to be strictly positive definite. We make them so. NOTE: we use the minimum and maximum value for ALL data, which leaks (at most), 2 data points per variable. This is non-problematic given the huge sample size.

#############################################################################
for (i in 1:14){
  absmin=abs(min(mycsv[,i]))+.01 # we are leaking only two values, min & max
  train[,i]=train[,i]+absmin
  test[,i]=test[,i]+absmin
}
#############################################################################

Box-Cox Multivariate Transformations

Next, we fit the training set quantitative (random) variables to the identity matrix.
NOTE: we assume that we have random effects regression, meaning that the IV’s are drawn from their own random variable for which we do not have complete observations / understanding. We apply these transformations to both the training and test set. The optimal lambdas derive from the training set and are applied to the test set to avoid leakage.

#############################################################################
myt=powerTransform(as.matrix(train[,1:14])~1)  #fit all quant to identity
print(myt$lambda) #print the labmdas
##        Number_Diagnoses            Staffed_Beds              Discharges 
##              0.23512308              0.17568479              0.28322338 
##               ER_Visits               Surgeries              Acute_Days 
##              0.45915176              0.38704739              0.24065596 
##              Net_Income Operating_Profit_Margin            Cash.on.Hand 
##              0.78772201              0.37289519              0.78961661 
##                  Equity   Affiliated_Physicians               Employees 
##              0.40550298              0.22890033              0.08555767 
##                Medicare                Medicaid 
##              0.64439836              0.05140977
for (i in 1:14){
  train[,i]=train[,i]^myt$lambda[i]#use lambdas
  test[,i]=test[,i]^myt$lambda[i] #use lambdas
  }
train=as.data.frame(train) #make into a data frame
test=as.data.frame(test)
#############################################################################

Regression Model

Next, we build a regression model on the training set.

#############################################################################
mylm=lm(Number_Diagnoses~., data=train) #build regression
mysummary=summary(mylm) #summarize
print("R^2:")
## [1] "R^2:"
mysummary$r.squared
## [1] 0.9235414
vif(mylm) #VIF
##                                  GVIF Df GVIF^(1/(2*Df))
## Staffed_Beds                10.208357  1        3.195052
## Discharges                  38.436480  1        6.199716
## ER_Visits                    3.349866  1        1.830264
## Surgeries                    2.909047  1        1.705593
## Acute_Days                  33.938858  1        5.825707
## Net_Income                   1.124942  1        1.060633
## Operating_Profit_Margin      1.047026  1        1.023243
## Cash.on.Hand                 1.093690  1        1.045797
## Equity                       1.471762  1        1.213162
## Affiliated_Physicians        6.821000  1        2.611704
## Employees                   11.930025  1        3.453987
## Medicare                     2.246103  1        1.498701
## Medicaid                     1.470336  1        1.212574
## Year_                        1.004981  2        1.001243
## State_                       5.692589 55        1.015936
## Urban_Rural_                 2.006832  1        1.416627
## Ownership_                   2.283780  2        1.229316
## Medical_School_Affiliation_  1.716926  4        1.069902
## Type_                       17.397600  6        1.268744
AIC(mylm) #Akaike Information Criterion
## [1] 15037.13
#Note: the highest adjusted vif's are 6.25 and 5.83. These are related.
#But they are stable in the coefficient estimates.
#############################################################################

Regression Plots

We evaluate the regression using traditional plots.

#############################################################################
plot(density(mylm$residuals)) #residual plots

par(ask=FALSE)
plot(mylm) #plots...Shows nonlinearity / heteroskadisticy.

#NOTE:  the sample size is too big for testing (always will be significant).
#############################################################################

Get the Standardized Coefficients

The standardized coefficients provide an outstanding method for comparing effects.

#############################################################################
#
mybetas=suppressWarnings(lm.beta(mylm)) #use the QuantPsych library to get the betas
new=data.frame(mybetas) #place the betas in a data frame
new$seq=seq(1,length(mybetas)) #get a sequence for the number of variables
new=new[order(-mybetas),c(1,2)] #sort descending importance
new$newseq=seq(1,length(mybetas)) #put a new sequence value in place
mytext=rownames(new) #get the rownames
new=data.frame(new) #restore to data frame
print(new[1:10, ]) #print the 10 most important terms
##                           mybetas seq newseq
## Type_Rehabilitation  1.621737e+06  82      1
## State_ND             1.262592e+05  46      2
## State_MT             7.848638e+04  44      3
## State_UT             6.533551e+04  63      4
## State_VI             3.553403e+04  65      5
## State_FL             1.560614e+04  25      6
## Type_DoD             3.493278e+01  79      7
## Type_Long Term       2.279329e+01  80      8
## State_HI             1.493424e+01  28      9
## State_CT             9.822156e+00  22     10
#############################################################################

Predict the Test Set

At the end of the day, this is what matters. How well do we predict the test set with linear regression? We run additional models in Python.

#############################################################################
mypredict = predict (mylm, test, interval="prediction") #predict on test set
mylm2=lm(test$Number_Diagnoses~mypredict[,1]) #regress actual vs. forecast
mytestsummary=summary(mylm2) #summary
print("R^2:")
## [1] "R^2:"
mytestsummary$r.squared
## [1] 0.9181246
newx <- seq(0, 12, length.out=nrow(test)) #let's set up for plotting
plot(test$Number_Diagnoses~mypredict[, 1], 
     main="# Diagnoses, Actual vs. Predicted, Blind Dataset w. Prediction Interval",
     xlab="Prediction", ylab="Actual")
abline(lm(test$Number_Diagnoses~mypredict[,1]), col="red") #add a fit line
abline(lm(mypredict[,2]~mypredict[,1]), col="blue")
abline(lm(mypredict[,3]~mypredict[,1]), col="blue")

#############################################################################

Re-run on the Entire Dataset

We re-run using all the data.

#############################################################################
for (i in 1:14){
  mycsv[,i]=mycsv[,i]+abs(min(mycsv[,i]))+.01  #positive definite transform
 }
mynewt=powerTransform(as.matrix(mycsv[,1:14])~1)  #fit all quant to identity
for (i in 1:14){
  mycsv[,i]=mycsv[,i]^mynewt$lambda[i]#use lambdas to transform
}
mylmfinal=lm(Number_Diagnoses~., data=mycsv) #build full regression
mysummary=summary(mylm) #summarize
print("R^2:")
## [1] "R^2:"
mysummary$r.squared  #get the R^2
## [1] 0.9235414
mypredict2 = predict (mylmfinal, mycsv, interval="prediction") #get the predictions

plot(mycsv$Number_Diagnoses~mypredict2[, 1], 
     main="# Diagnoses, Actual vs. Predicted, Full Datas w. Prediction Interval",
     xlab="Prediction", ylab="Actual")
abline(lm(mycsv$Number_Diagnoses~mypredict2[,1]), col="red") #add a fit line
abline(lm(mypredict2[,2]~mypredict2[,1]), col="blue")
abline(lm(mypredict2[,3]~mypredict2[,1]), col="blue")

#############################################################################