This analysis attempts to predict the number of diagnoses of heart failure. The dependent variable is the number of diagnoses. The unit of analysis is the hospital level by year. Independent variable groups include financial, workload, technical, geospatial, and temporal variables.
Load the printing functions and all libraries
#############################################################################
knitr::opts_chunk$set(echo = TRUE)
library(lemon)
## Warning: package 'lemon' was built under R version 3.5.3
library(knitr)
knit_print.data.frame <- lemon_print
library(psych)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.3
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(scales)
##
## Attaching package: 'scales'
## The following objects are masked from 'package:psych':
##
## alpha, rescale
library(gridExtra)
library(Amelia)
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 3.5.3
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.5, built: 2018-05-07)
## ## Copyright (C) 2005-2019 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(mice)
## Loading required package: lattice
##
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(MASS)
library(leaps)
## Warning: package 'leaps' was built under R version 3.5.2
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
library(ggplot2)
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 3.5.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages -------------------------------------------------------- tidyverse 1.2.1 --
## v tibble 2.1.3 v purrr 0.2.5
## v tidyr 0.8.2 v dplyr 0.8.3
## v readr 1.1.1 v stringr 1.3.1
## v tibble 2.1.3 v forcats 0.3.0
## Warning: package 'tibble' was built under R version 3.5.3
## Warning: package 'dplyr' was built under R version 3.5.3
## -- Conflicts ----------------------------------------------------------- tidyverse_conflicts() --
## x purrr::%||%() masks lemon::%||%()
## x ggplot2::%+%() masks psych::%+%()
## x scales::alpha() masks ggplot2::alpha(), psych::alpha()
## x readr::col_factor() masks scales::col_factor()
## x dplyr::combine() masks gridExtra::combine()
## x tidyr::complete() masks mice::complete()
## x purrr::discard() masks scales::discard()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x dplyr::recode() masks car::recode()
## x dplyr::select() masks MASS::select()
## x purrr::some() masks car::some()
library(caret)
## Warning: package 'caret' was built under R version 3.5.2
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(glmnet)
## Warning: package 'glmnet' was built under R version 3.5.3
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded glmnet 2.0-18
library(fpp2)
## Warning: package 'fpp2' was built under R version 3.5.3
## Loading required package: forecast
## Loading required package: fma
##
## Attaching package: 'fma'
## The following objects are masked from 'package:MASS':
##
## cement, housing, petrol
## Loading required package: expsmooth
library(hts)
## Warning: package 'hts' was built under R version 3.5.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.5.3
## corrplot 0.84 loaded
library(clusterGeneration)
## Warning: package 'clusterGeneration' was built under R version 3.5.3
library(xgboost)
## Warning: package 'xgboost' was built under R version 3.5.3
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(ResourceSelection)
## Warning: package 'ResourceSelection' was built under R version 3.5.3
## ResourceSelection 0.3-5 2019-07-22
library(keras)
library(NbClust)
## Warning: package 'NbClust' was built under R version 3.5.2
library(factoextra)
## Warning: package 'factoextra' was built under R version 3.5.3
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(scanstatistics)
## Warning: package 'scanstatistics' was built under R version 3.5.3
library(lattice)
library(latticeExtra)
## Warning: package 'latticeExtra' was built under R version 3.5.2
## Loading required package: RColorBrewer
##
## Attaching package: 'latticeExtra'
## The following object is masked from 'package:ggplot2':
##
## layer
library(QuantPsyc)
## Warning: package 'QuantPsyc' was built under R version 3.5.3
## Loading required package: boot
##
## Attaching package: 'boot'
## The following object is masked from 'package:car':
##
## logit
## The following object is masked from 'package:lattice':
##
## melanoma
## The following object is masked from 'package:psych':
##
## logit
##
## Attaching package: 'QuantPsyc'
## The following object is masked from 'package:Matrix':
##
## norm
## The following object is masked from 'package:base':
##
## norm
#############################################################################
Read, investigate missing.
#############################################################################
mycsv=read.csv("D:/heart/thf2.csv")
mycsv$Acute_Days=as.numeric(mycsv$Acute_Days) #convert Acute Days to numeric
mycsv$Operating_Profit_Margin=as.numeric(mycsv$Operating_Profit_Margin)
missmap(mycsv, y.labels="", y.at=1, legend=TRUE) #missing map
#NOTE: 14 blanks regarding Ownership were filled with the mode.
#############################################################################
We could select any reasonable percentage here. For our purposes, we use 20% as a test set. We train using 12,783 observations. We withhold 3,196 for validation.
#############################################################################
set.seed(1234) #set a pseudo-random number
mys=sort(sample(1:nrow(mycsv),.2*nrow(mycsv), replace=FALSE))#sample 20%
train=mycsv[-mys,] #training set omits that 20%
test=mycsv[mys,] #test set is that 20%
#############################################################################
There are only 3% missing data, so mean imputation makes sense (conservative). Some argue that we are “spilling” by filling the missing with the whole dataset rather than calculating the means on a training set and using this for the test set. The proportion of missing is so small that this leakage not a significant problem. Still, we use the training set means to impute the test set missing.
#############################################################################
for(i in 2:14){ #replace 1% missing with means
mysubs=mean(na.omit(train[,i]))
train[is.na(train[,i]), i] <- mysubs #training set
test[is.na(test[,i]), i] <- mysubs #test set
mycsv[is.na(mycsv[,i]), i] <- mean(mycsv[,i], na.rm = TRUE) #all for desc.
}
missmap(train, y.labels="", y.at=1, legend=TRUE) #check missing train
missmap(test, y.labels="", y.at=1, legend = TRUE) #check missing test
missmap(mycsv,y.labels="", y.at=1, legend=TRUE) #check missing overall
#NOTE: we used the mode for the missing values for ownership.
#############################################################################
We provide descriptive statistics by year and overall using the total dataset.
#############################################################################
mydescribe=function(x){ #Function to Describe by Years
mysub=mycsv[mycsv$Year_==x,] #subset by year
describe(mysub) #describe
}
mydescribe("Y2016") #Run Y2016
## vars n mean sd
## Number_Diagnoses 1 5353 1024.26 1540.40
## Staffed_Beds 2 5353 134.56 161.91
## Discharges 3 5353 6163.25 9317.04
## ER_Visits 4 5353 30714.65 31939.40
## Surgeries 5 5353 5992.91 7505.83
## Acute_Days 6 5353 31131.38 48359.51
## Net_Income 7 5353 10526316.74 92603127.53
## Operating_Profit_Margin 8 5353 -0.03 1.12
## Cash.on.Hand 9 5353 17370333.76 109340504.40
## Equity 10 5353 144162296.54 498946461.48
## Affiliated_Physicians 11 5353 204.57 334.03
## Employees 12 5353 886.32 1574.61
## Medicare 13 5353 0.45 0.20
## Medicaid 14 5353 0.09 0.10
## Year_* 15 5353 1.00 0.00
## State_* 16 5353 28.64 15.82
## Urban_Rural_* 17 5353 1.58 0.49
## Ownership_* 18 5353 2.33 0.77
## Medical_School_Affiliation_* 19 5353 3.58 0.83
## Type_* 20 5353 5.46 2.11
## median trimmed mad
## Number_Diagnoses 339.00 685.27 440.33
## Staffed_Beds 76.00 101.97 75.61
## Discharges 2095.00 4160.70 2770.98
## ER_Visits 26470.00 25441.89 24477.73
## Surgeries 4767.00 4612.71 4289.16
## Acute_Days 12587.00 20485.76 15493.17
## Net_Income 1555170.00 5479701.56 6508301.17
## Operating_Profit_Margin -0.02 -0.02 0.13
## Cash.on.Hand 1727685.00 5101209.51 2639432.75
## Equity 24749610.00 60671402.78 39079656.21
## Affiliated_Physicians 85.00 132.01 112.68
## Employees 345.00 548.69 378.06
## Medicare 0.43 0.45 0.20
## Medicaid 0.07 0.07 0.06
## Year_* 1.00 1.00 0.00
## State_* 27.00 28.77 22.24
## Urban_Rural_* 2.00 1.60 0.00
## Ownership_* 3.00 2.42 0.00
## Medical_School_Affiliation_* 4.00 3.75 0.00
## Type_* 7.00 5.70 0.00
## min max range skew
## Number_Diagnoses 1.100000e+01 1.653300e+04 1.652200e+04 2.87
## Staffed_Beds 1.000000e+00 2.753000e+03 2.752000e+03 3.25
## Discharges 1.000000e+00 1.293390e+05 1.293380e+05 3.24
## ER_Visits 0.000000e+00 5.434570e+05 5.434570e+05 3.19
## Surgeries 0.000000e+00 1.307410e+05 1.307410e+05 4.80
## Acute_Days 1.400000e+01 6.614220e+05 6.614080e+05 3.69
## Net_Income -1.214339e+09 2.734304e+09 3.948643e+09 7.08
## Operating_Profit_Margin -1.505000e+01 5.772000e+01 7.277000e+01 39.22
## Cash.on.Hand -2.512770e+09 3.880477e+09 6.393247e+09 11.24
## Equity -3.248783e+09 1.024298e+10 1.349176e+10 11.09
## Affiliated_Physicians 1.000000e+00 4.266000e+03 4.265000e+03 4.10
## Employees 4.000000e+00 2.467300e+04 2.466900e+04 5.43
## Medicare 0.000000e+00 9.800000e-01 9.800000e-01 0.31
## Medicaid 0.000000e+00 9.600000e-01 9.600000e-01 2.86
## Year_* 1.000000e+00 1.000000e+00 0.000000e+00 NaN
## State_* 1.000000e+00 5.600000e+01 5.500000e+01 -0.04
## Urban_Rural_* 1.000000e+00 2.000000e+00 1.000000e+00 -0.33
## Ownership_* 1.000000e+00 3.000000e+00 2.000000e+00 -0.65
## Medical_School_Affiliation_* 1.000000e+00 5.000000e+00 4.000000e+00 -1.60
## Type_* 1.000000e+00 7.000000e+00 6.000000e+00 -0.84
## kurtosis se
## Number_Diagnoses 11.87 21.05
## Staffed_Beds 21.92 2.21
## Discharges 18.20 127.34
## ER_Visits 23.60 436.54
## Surgeries 45.16 102.59
## Acute_Days 22.47 660.97
## Net_Income 242.02 1265689.08
## Operating_Profit_Margin 1921.70 0.02
## Cash.on.Hand 448.15 1494453.65
## Equity 182.35 6819543.83
## Affiliated_Physicians 26.39 4.57
## Employees 47.38 21.52
## Medicare -0.44 0.00
## Medicaid 12.57 0.00
## Year_* NaN 0.00
## State_* -1.28 0.22
## Urban_Rural_* -1.89 0.01
## Ownership_* -1.04 0.01
## Medical_School_Affiliation_* 1.43 0.01
## Type_* -1.09 0.03
mydescribe("Y2017") #Run Y2017
## vars n mean sd
## Number_Diagnoses 1 5328 1074.81 1628.65
## Staffed_Beds 2 5328 134.39 162.43
## Discharges 3 5328 6188.25 9333.42
## ER_Visits 4 5328 30741.98 32094.02
## Surgeries 5 5328 6010.48 7536.27
## Acute_Days 6 5328 31255.05 48415.90
## Net_Income 7 5328 10682310.72 92765354.87
## Operating_Profit_Margin 8 5328 -0.03 1.11
## Cash.on.Hand 9 5328 17455050.60 109588184.49
## Equity 10 5328 145331834.40 499421382.36
## Affiliated_Physicians 11 5328 204.76 334.32
## Employees 12 5328 891.76 1585.19
## Medicare 13 5328 0.45 0.20
## Medicaid 14 5328 0.09 0.10
## Year_* 15 5328 2.00 0.00
## State_* 16 5328 28.61 15.84
## Urban_Rural_* 17 5328 1.58 0.49
## Ownership_* 18 5328 2.34 0.77
## Medical_School_Affiliation_* 19 5328 3.58 0.83
## Type_* 20 5328 5.47 2.11
## median trimmed mad
## Number_Diagnoses 348.00 714.50 450.71
## Staffed_Beds 74.00 101.60 72.65
## Discharges 2095.50 4186.93 2764.31
## ER_Visits 26302.00 25442.00 24457.71
## Surgeries 4763.00 4623.06 4314.37
## Acute_Days 12653.50 20622.97 15559.89
## Net_Income 1567158.00 5585925.05 6564952.80
## Operating_Profit_Margin -0.02 -0.02 0.13
## Cash.on.Hand 1727641.50 5088121.76 2635056.11
## Equity 25271947.00 61515156.64 39728710.32
## Affiliated_Physicians 84.00 132.11 111.19
## Employees 345.00 551.75 376.58
## Medicare 0.43 0.45 0.20
## Medicaid 0.07 0.07 0.06
## Year_* 2.00 2.00 0.00
## State_* 27.00 28.73 22.24
## Urban_Rural_* 2.00 1.60 0.00
## Ownership_* 3.00 2.42 0.00
## Medical_School_Affiliation_* 4.00 3.74 0.00
## Type_* 7.00 5.71 0.00
## min max range skew
## Number_Diagnoses 1.100000e+01 1.703100e+04 1.702000e+04 2.86
## Staffed_Beds 1.000000e+00 2.753000e+03 2.752000e+03 3.23
## Discharges 1.000000e+00 1.293390e+05 1.293380e+05 3.23
## ER_Visits 0.000000e+00 5.434570e+05 5.434570e+05 3.20
## Surgeries 0.000000e+00 1.307410e+05 1.307410e+05 4.78
## Acute_Days 1.400000e+01 6.614220e+05 6.614080e+05 3.68
## Net_Income -1.214339e+09 2.734304e+09 3.948643e+09 7.06
## Operating_Profit_Margin -1.505000e+01 5.772000e+01 7.277000e+01 40.77
## Cash.on.Hand -2.512770e+09 3.880477e+09 6.393247e+09 11.22
## Equity -3.248783e+09 1.024298e+10 1.349176e+10 11.06
## Affiliated_Physicians 1.000000e+00 4.266000e+03 4.265000e+03 4.09
## Employees 4.000000e+00 2.467300e+04 2.466900e+04 5.41
## Medicare 0.000000e+00 9.800000e-01 9.800000e-01 0.31
## Medicaid 0.000000e+00 9.600000e-01 9.600000e-01 2.81
## Year_* 2.000000e+00 2.000000e+00 0.000000e+00 NaN
## State_* 1.000000e+00 5.600000e+01 5.500000e+01 -0.03
## Urban_Rural_* 1.000000e+00 2.000000e+00 1.000000e+00 -0.33
## Ownership_* 1.000000e+00 3.000000e+00 2.000000e+00 -0.66
## Medical_School_Affiliation_* 1.000000e+00 5.000000e+00 4.000000e+00 -1.61
## Type_* 1.000000e+00 7.000000e+00 6.000000e+00 -0.85
## kurtosis se
## Number_Diagnoses 11.65 22.31
## Staffed_Beds 21.72 2.23
## Discharges 18.11 127.87
## ER_Visits 23.50 439.69
## Surgeries 44.62 103.25
## Acute_Days 22.40 663.29
## Net_Income 241.38 1270877.53
## Operating_Profit_Margin 2003.92 0.02
## Cash.on.Hand 446.13 1501348.88
## Equity 181.96 6842030.79
## Affiliated_Physicians 26.30 4.58
## Employees 46.77 21.72
## Medicare -0.44 0.00
## Medicaid 12.10 0.00
## Year_* NaN 0.00
## State_* -1.28 0.22
## Urban_Rural_* -1.89 0.01
## Ownership_* -1.03 0.01
## Medical_School_Affiliation_* 1.41 0.01
## Type_* -1.08 0.03
mydescribe("Y2018") #Run Y2018
## vars n mean sd
## Number_Diagnoses 1 5298 1099.37 1703.45
## Staffed_Beds 2 5298 134.16 163.11
## Discharges 3 5298 6212.14 9352.82
## ER_Visits 4 5298 30762.97 32181.34
## Surgeries 5 5298 6024.85 7541.14
## Acute_Days 6 5298 31421.26 48575.61
## Net_Income 7 5298 10680677.12 93002739.22
## Operating_Profit_Margin 8 5298 -0.04 0.78
## Cash.on.Hand 9 5298 17536289.90 109859031.30
## Equity 10 5298 146188576.77 500266829.97
## Affiliated_Physicians 11 5298 205.45 335.47
## Employees 12 5298 895.54 1585.96
## Medicare 13 5298 0.45 0.20
## Medicaid 14 5298 0.09 0.10
## Year_* 15 5298 3.00 0.00
## State_* 16 5298 28.61 15.82
## Urban_Rural_* 17 5298 1.58 0.49
## Ownership_* 18 5298 2.33 0.78
## Medical_School_Affiliation_* 19 5298 3.57 0.83
## Type_* 20 5298 5.47 2.11
## median trimmed mad
## Number_Diagnoses 347.00 719.66 452.19
## Staffed_Beds 73.00 101.12 71.16
## Discharges 2112.00 4208.92 2784.32
## ER_Visits 26249.50 25438.41 24503.67
## Surgeries 4806.50 4638.69 4320.30
## Acute_Days 12719.00 20755.74 15668.86
## Net_Income 1637961.00 5658277.62 6588999.83
## Operating_Profit_Margin -0.02 -0.02 0.13
## Cash.on.Hand 1804929.00 5147832.72 2740340.73
## Equity 25669602.50 62036278.68 39881103.81
## Affiliated_Physicians 84.00 132.40 111.19
## Employees 347.50 555.52 380.29
## Medicare 0.43 0.44 0.20
## Medicaid 0.07 0.07 0.06
## Year_* 3.00 3.00 0.00
## State_* 27.00 28.72 22.24
## Urban_Rural_* 2.00 1.60 0.00
## Ownership_* 3.00 2.42 0.00
## Medical_School_Affiliation_* 4.00 3.74 0.00
## Type_* 7.00 5.71 0.00
## min max range skew
## Number_Diagnoses 1.100000e+01 1.785100e+04 1.784000e+04 3.04
## Staffed_Beds 1.000000e+00 2.753000e+03 2.752000e+03 3.22
## Discharges 1.000000e+00 1.293390e+05 1.293380e+05 3.23
## ER_Visits 0.000000e+00 5.434570e+05 5.434570e+05 3.19
## Surgeries 0.000000e+00 1.307410e+05 1.307410e+05 4.78
## Acute_Days 4.800000e+01 6.614220e+05 6.613740e+05 3.66
## Net_Income -1.214339e+09 2.734304e+09 3.948643e+09 7.05
## Operating_Profit_Margin -1.505000e+01 4.721000e+01 6.226000e+01 41.88
## Cash.on.Hand -2.512770e+09 3.880477e+09 6.393247e+09 11.19
## Equity -3.248783e+09 1.024298e+10 1.349176e+10 11.05
## Affiliated_Physicians 1.000000e+00 4.266000e+03 4.265000e+03 4.07
## Employees 4.000000e+00 2.467300e+04 2.466900e+04 5.40
## Medicare 0.000000e+00 9.800000e-01 9.800000e-01 0.32
## Medicaid 0.000000e+00 9.600000e-01 9.600000e-01 2.82
## Year_* 3.000000e+00 3.000000e+00 0.000000e+00 NaN
## State_* 1.000000e+00 5.600000e+01 5.500000e+01 -0.03
## Urban_Rural_* 1.000000e+00 2.000000e+00 1.000000e+00 -0.32
## Ownership_* 1.000000e+00 3.000000e+00 2.000000e+00 -0.65
## Medical_School_Affiliation_* 1.000000e+00 5.000000e+00 4.000000e+00 -1.62
## Type_* 1.000000e+00 7.000000e+00 6.000000e+00 -0.86
## kurtosis se
## Number_Diagnoses 13.63 23.40
## Staffed_Beds 21.48 2.24
## Discharges 18.02 128.50
## ER_Visits 23.38 442.13
## Surgeries 44.71 103.61
## Acute_Days 22.17 667.36
## Net_Income 240.26 1277731.98
## Operating_Profit_Margin 2625.38 0.01
## Cash.on.Hand 444.20 1509314.65
## Equity 181.58 6872990.30
## Affiliated_Physicians 26.04 4.61
## Employees 46.77 21.79
## Medicare -0.44 0.00
## Medicaid 11.96 0.00
## Year_* NaN 0.00
## State_* -1.28 0.22
## Urban_Rural_* -1.89 0.01
## Ownership_* -1.05 0.01
## Medical_School_Affiliation_* 1.37 0.01
## Type_* -1.07 0.03
describe(mycsv) #describe data en toto
## vars n mean sd
## Number_Diagnoses 1 15979 1066.02 1625.45
## Staffed_Beds 2 15979 134.37 162.47
## Discharges 3 15979 6187.80 9333.81
## ER_Visits 4 15979 30739.79 32069.33
## Surgeries 5 15979 6009.36 7527.24
## Acute_Days 6 15979 31268.73 48447.16
## Net_Income 7 15979 10629510.76 92784081.79
## Operating_Profit_Margin 8 15979 -0.03 1.02
## Cash.on.Hand 9 15979 17453605.99 109588379.07
## Equity 10 15979 145224098.30 499512322.58
## Affiliated_Physicians 11 15979 204.93 334.58
## Employees 12 15979 891.19 1581.82
## Medicare 13 15979 0.45 0.20
## Medicaid 14 15979 0.09 0.10
## Year_* 15 15979 2.00 0.82
## State_* 16 15979 28.62 15.82
## Urban_Rural_* 17 15979 1.58 0.49
## Ownership_* 18 15979 2.33 0.77
## Medical_School_Affiliation_* 19 15979 3.58 0.83
## Type_* 20 15979 5.47 2.11
## median trimmed mad
## Number_Diagnoses 344.00 706.09 446.26
## Staffed_Beds 74.00 101.56 72.65
## Discharges 2102.00 4184.72 2775.43
## ER_Visits 26314.00 25439.27 24473.28
## Surgeries 4777.00 4624.44 4317.33
## Acute_Days 12655.00 20617.36 15573.23
## Net_Income 1585375.00 5572344.00 6558153.60
## Operating_Profit_Margin -0.02 -0.02 0.13
## Cash.on.Hand 1758316.00 5110766.94 2676927.70
## Equity 25182432.00 61386516.91 39543320.09
## Affiliated_Physicians 84.00 132.15 111.19
## Employees 346.00 551.89 378.06
## Medicare 0.43 0.45 0.20
## Medicaid 0.07 0.07 0.06
## Year_* 2.00 2.00 1.48
## State_* 27.00 28.74 22.24
## Urban_Rural_* 2.00 1.60 0.00
## Ownership_* 3.00 2.42 0.00
## Medical_School_Affiliation_* 4.00 3.74 0.00
## Type_* 7.00 5.71 0.00
## min max range skew
## Number_Diagnoses 1.100000e+01 1.785100e+04 1.784000e+04 2.94
## Staffed_Beds 1.000000e+00 2.753000e+03 2.752000e+03 3.23
## Discharges 1.000000e+00 1.293390e+05 1.293380e+05 3.24
## ER_Visits 0.000000e+00 5.434570e+05 5.434570e+05 3.19
## Surgeries 0.000000e+00 1.307410e+05 1.307410e+05 4.79
## Acute_Days 1.400000e+01 6.614220e+05 6.614080e+05 3.68
## Net_Income -1.214339e+09 2.734304e+09 3.948643e+09 7.06
## Operating_Profit_Margin -1.505000e+01 5.772000e+01 7.277000e+01 41.61
## Cash.on.Hand -2.512770e+09 3.880477e+09 6.393247e+09 11.22
## Equity -3.248783e+09 1.024298e+10 1.349176e+10 11.07
## Affiliated_Physicians 1.000000e+00 4.266000e+03 4.265000e+03 4.09
## Employees 4.000000e+00 2.467300e+04 2.466900e+04 5.42
## Medicare 0.000000e+00 9.800000e-01 9.800000e-01 0.31
## Medicaid 0.000000e+00 9.600000e-01 9.600000e-01 2.83
## Year_* 1.000000e+00 3.000000e+00 2.000000e+00 0.01
## State_* 1.000000e+00 5.600000e+01 5.500000e+01 -0.04
## Urban_Rural_* 1.000000e+00 2.000000e+00 1.000000e+00 -0.33
## Ownership_* 1.000000e+00 3.000000e+00 2.000000e+00 -0.66
## Medical_School_Affiliation_* 1.000000e+00 5.000000e+00 4.000000e+00 -1.61
## Type_* 1.000000e+00 7.000000e+00 6.000000e+00 -0.85
## kurtosis se
## Number_Diagnoses 12.63 12.86
## Staffed_Beds 21.71 1.29
## Discharges 18.11 73.84
## ER_Visits 23.50 253.70
## Surgeries 44.84 59.55
## Acute_Days 22.35 383.26
## Net_Income 241.28 734004.42
## Operating_Profit_Margin 2205.58 0.01
## Cash.on.Hand 446.27 866941.32
## Equity 182.01 3951585.73
## Affiliated_Physicians 26.25 2.65
## Employees 46.98 12.51
## Medicare -0.44 0.00
## Medicaid 12.22 0.00
## Year_* -1.50 0.01
## State_* -1.28 0.13
## Urban_Rural_* -1.89 0.00
## Ownership_* -1.04 0.01
## Medical_School_Affiliation_* 1.41 0.01
## Type_* -1.08 0.02
#############################################################################
We look at histograms and boxplots for quantitative data.
#############################################################################
#time series graph
z=aggregate(Number_Diagnoses~Year_, mycsv,sum) #aggregate DV by year
myts=ts(z$Number_Diagnoses, start=c(2016), frequency=1) #produce time series
plot(myts, ylab="# Diagnoses", xlab="Year", col="red" ) #plot time series
#boxplot function and graphs
mynames=names(mycsv) #get the names for the data
for (i in 1:14){
print(histogram(~mycsv[,i]|Year_, data=mycsv, breaks=31,xlab=mynames[i]))
print(bwplot(~mycsv[,i]|Year_, data=mycsv, xlab=mynames[i]))
}
#############################################################################
We look at barplots for qualitative data.
#############################################################################
for (i in 15:20){
p = ggplot(data = mycsv, aes(x = mycsv[,i], fill=Year_))
p = p + geom_bar(stat='count')
p = p + facet_grid(~Year_, scale='free_x')
p = p + ggtitle(mynames[i])
p = p + xlab('')
p = p + ylab('Count')
p = p + coord_flip()
p = p + theme(axis.text.x = element_text(angle = 90, hjust = 1))
print(p)
barplot(sort(table(mycsv[, i]), decreasing = FALSE), main=mynames[i], col="red", horiz=TRUE, las=2, cex.names=.5)
}
#############################################################################
We investigate hierarchically clustered correlation plots
#############################################################################
library(corrplot)
sub=cor(mycsv[,1:14])
corrplot(sub, type="upper",method="ellipse",order="hclust") #hierarchically clustered
#############################################################################
With the obvious non-normality, we seek a solution that cannot reject multivariate non-normality using multivariate Box-Cox transformations. To do so, requires all observations to be strictly positive definite. We make them so. NOTE: we use the minimum and maximum value for ALL data, which leaks (at most), 2 data points per variable. This is non-problematic given the huge sample size.
#############################################################################
for (i in 1:14){
absmin=abs(min(mycsv[,i]))+.01 # we are leaking only two values, min & max
train[,i]=train[,i]+absmin
test[,i]=test[,i]+absmin
}
#############################################################################
Next, we fit the training set quantitative (random) variables to the identity matrix.
NOTE: we assume that we have random effects regression, meaning that the IV’s are drawn from their own random variable for which we do not have complete observations / understanding. We apply these transformations to both the training and test set. The optimal lambdas derive from the training set and are applied to the test set to avoid leakage.
#############################################################################
myt=powerTransform(as.matrix(train[,1:14])~1) #fit all quant to identity
print(myt$lambda) #print the labmdas
## Number_Diagnoses Staffed_Beds Discharges
## 0.23512308 0.17568479 0.28322338
## ER_Visits Surgeries Acute_Days
## 0.45915176 0.38704739 0.24065596
## Net_Income Operating_Profit_Margin Cash.on.Hand
## 0.78772201 0.37289519 0.78961661
## Equity Affiliated_Physicians Employees
## 0.40550298 0.22890033 0.08555767
## Medicare Medicaid
## 0.64439836 0.05140977
for (i in 1:14){
train[,i]=train[,i]^myt$lambda[i]#use lambdas
test[,i]=test[,i]^myt$lambda[i] #use lambdas
}
train=as.data.frame(train) #make into a data frame
test=as.data.frame(test)
#############################################################################
Next, we build a regression model on the training set.
#############################################################################
mylm=lm(Number_Diagnoses~., data=train) #build regression
mysummary=summary(mylm) #summarize
print("R^2:")
## [1] "R^2:"
mysummary$r.squared
## [1] 0.9235414
vif(mylm) #VIF
## GVIF Df GVIF^(1/(2*Df))
## Staffed_Beds 10.208357 1 3.195052
## Discharges 38.436480 1 6.199716
## ER_Visits 3.349866 1 1.830264
## Surgeries 2.909047 1 1.705593
## Acute_Days 33.938858 1 5.825707
## Net_Income 1.124942 1 1.060633
## Operating_Profit_Margin 1.047026 1 1.023243
## Cash.on.Hand 1.093690 1 1.045797
## Equity 1.471762 1 1.213162
## Affiliated_Physicians 6.821000 1 2.611704
## Employees 11.930025 1 3.453987
## Medicare 2.246103 1 1.498701
## Medicaid 1.470336 1 1.212574
## Year_ 1.004981 2 1.001243
## State_ 5.692589 55 1.015936
## Urban_Rural_ 2.006832 1 1.416627
## Ownership_ 2.283780 2 1.229316
## Medical_School_Affiliation_ 1.716926 4 1.069902
## Type_ 17.397600 6 1.268744
AIC(mylm) #Akaike Information Criterion
## [1] 15037.13
#Note: the highest adjusted vif's are 6.25 and 5.83. These are related.
#But they are stable in the coefficient estimates.
#############################################################################
We evaluate the regression using traditional plots.
#############################################################################
plot(density(mylm$residuals)) #residual plots
par(ask=FALSE)
plot(mylm) #plots...Shows nonlinearity / heteroskadisticy.
#NOTE: the sample size is too big for testing (always will be significant).
#############################################################################
The standardized coefficients provide an outstanding method for comparing effects.
#############################################################################
#
mybetas=suppressWarnings(lm.beta(mylm)) #use the QuantPsych library to get the betas
new=data.frame(mybetas) #place the betas in a data frame
new$seq=seq(1,length(mybetas)) #get a sequence for the number of variables
new=new[order(-mybetas),c(1,2)] #sort descending importance
new$newseq=seq(1,length(mybetas)) #put a new sequence value in place
mytext=rownames(new) #get the rownames
new=data.frame(new) #restore to data frame
print(new[1:10, ]) #print the 10 most important terms
## mybetas seq newseq
## Type_Rehabilitation 1.621737e+06 82 1
## State_ND 1.262592e+05 46 2
## State_MT 7.848638e+04 44 3
## State_UT 6.533551e+04 63 4
## State_VI 3.553403e+04 65 5
## State_FL 1.560614e+04 25 6
## Type_DoD 3.493278e+01 79 7
## Type_Long Term 2.279329e+01 80 8
## State_HI 1.493424e+01 28 9
## State_CT 9.822156e+00 22 10
#############################################################################
At the end of the day, this is what matters. How well do we predict the test set with linear regression? We run additional models in Python.
#############################################################################
mypredict = predict (mylm, test, interval="prediction") #predict on test set
mylm2=lm(test$Number_Diagnoses~mypredict[,1]) #regress actual vs. forecast
mytestsummary=summary(mylm2) #summary
print("R^2:")
## [1] "R^2:"
mytestsummary$r.squared
## [1] 0.9181246
newx <- seq(0, 12, length.out=nrow(test)) #let's set up for plotting
plot(test$Number_Diagnoses~mypredict[, 1],
main="# Diagnoses, Actual vs. Predicted, Blind Dataset w. Prediction Interval",
xlab="Prediction", ylab="Actual")
abline(lm(test$Number_Diagnoses~mypredict[,1]), col="red") #add a fit line
abline(lm(mypredict[,2]~mypredict[,1]), col="blue")
abline(lm(mypredict[,3]~mypredict[,1]), col="blue")
#############################################################################
We re-run using all the data.
#############################################################################
for (i in 1:14){
mycsv[,i]=mycsv[,i]+abs(min(mycsv[,i]))+.01 #positive definite transform
}
mynewt=powerTransform(as.matrix(mycsv[,1:14])~1) #fit all quant to identity
for (i in 1:14){
mycsv[,i]=mycsv[,i]^mynewt$lambda[i]#use lambdas to transform
}
mylmfinal=lm(Number_Diagnoses~., data=mycsv) #build full regression
mysummary=summary(mylm) #summarize
print("R^2:")
## [1] "R^2:"
mysummary$r.squared #get the R^2
## [1] 0.9235414
mypredict2 = predict (mylmfinal, mycsv, interval="prediction") #get the predictions
plot(mycsv$Number_Diagnoses~mypredict2[, 1],
main="# Diagnoses, Actual vs. Predicted, Full Datas w. Prediction Interval",
xlab="Prediction", ylab="Actual")
abline(lm(mycsv$Number_Diagnoses~mypredict2[,1]), col="red") #add a fit line
abline(lm(mypredict2[,2]~mypredict2[,1]), col="blue")
abline(lm(mypredict2[,3]~mypredict2[,1]), col="blue")
#############################################################################