Introduction

This analysis forecasts the number of diagnoses and admissions for heart failure.

Three units of analysis (UA): hospital, state, county Two dependent variables: number of diagnoses (hospital UA), admission rate (state & county)

Methods: OLS, Lasso, enet, RF, extra trees, gradient boosting, bagging, hospital UA OLS, Lasso, enet, GIS for admission rates at state and county UA.

Software: Python 3.8 R Version 4.0.2 R Studio IDE

Setup

Load the printing functions and all libraries

#############################################################################
knitr::opts_chunk$set(echo = TRUE)    
library(Amelia)

## Warning: package 'Amelia' was built under R version 4.0.5

## Warning: package 'Rcpp' was built under R version 4.0.5

library(car)

## Warning: package 'car' was built under R version 4.0.5

library(caret)

## Warning: package 'caret' was built under R version 4.0.5

## Warning: package 'ggplot2' was built under R version 4.0.5

library(clusterGeneration)

## Warning: package 'clusterGeneration' was built under R version 4.0.5

library(corrplot)
#library(covTest)
library(elsa)

## Warning: package 'elsa' was built under R version 4.0.5

## Warning: package 'sp' was built under R version 4.0.5

## Warning: package 'raster' was built under R version 4.0.5

## Warning: namespace 'terra' is not available and has been replaced
## by .GlobalEnv when processing object '<unknown>'

## Warning: namespace 'terra' is not available and has been replaced
## by .GlobalEnv when processing object '<unknown>'

## Warning: namespace 'terra' is not available and has been replaced
## by .GlobalEnv when processing object '<unknown>'

## Warning: namespace 'terra' is not available and has been replaced
## by .GlobalEnv when processing object '<unknown>'

library(factoextra)

## Warning: package 'factoextra' was built under R version 4.0.5

library(fpp2)

## Warning: package 'fpp2' was built under R version 4.0.5

## Warning: package 'forecast' was built under R version 4.0.5

## Warning: package 'fma' was built under R version 4.0.5

## Warning: package 'expsmooth' was built under R version 4.0.5

library(ggcorrplot)

## Warning: package 'ggcorrplot' was built under R version 4.0.5

library(ggExtra)

## Warning: package 'ggExtra' was built under R version 4.0.5

library(ggplot2)
library(glmnet)

## Warning: package 'glmnet' was built under R version 4.0.5

library(glmpath)

## Warning: package 'glmpath' was built under R version 4.0.5

library(grid)        
library(gridExtra)

## Warning: package 'gridExtra' was built under R version 4.0.5

library(hts)

## Warning: package 'hts' was built under R version 4.0.5

library(kableExtra)

## Warning: package 'kableExtra' was built under R version 4.0.5

library(knitr)

## Warning: package 'knitr' was built under R version 4.0.5

library(lars)        
library(lattice)
library(latticeExtra)

## Warning: package 'latticeExtra' was built under R version 4.0.5

library(leaflet)

## Warning: package 'leaflet' was built under R version 4.0.5

library(leaps)

## Warning: package 'leaps' was built under R version 4.0.5

#library(lmSupport)   
library(maptools)

## Warning: package 'maptools' was built under R version 4.0.5

library(MASS)
library(mice)

## Warning: package 'mice' was built under R version 4.0.5

library(NbClust)
library(psych)

## Warning: package 'psych' was built under R version 4.0.5

library(QuantPsyc)

## Warning: package 'QuantPsyc' was built under R version 4.0.5

## Warning: package 'boot' was built under R version 4.0.5

library(raster)      
library(RColorBrewer)
library(ResourceSelection)

## Warning: package 'ResourceSelection' was built under R version 4.0.5

library(reticulate)

## Warning: package 'reticulate' was built under R version 4.0.5

library(rgdal)       
library(rgeos)

## Warning: package 'rgeos' was built under R version 4.0.5

library(scales)

## Warning: package 'scales' was built under R version 4.0.5

library(sf)

## Warning: package 'sf' was built under R version 4.0.5

library(shiny)

## Warning: package 'shiny' was built under R version 4.0.5

library(sp)          
library(spatialEco)

## Warning: package 'spatialEco' was built under R version 4.0.5

library(spatialreg)

## Warning: package 'spatialreg' was built under R version 4.0.5

## Warning: package 'spData' was built under R version 4.0.5

## Warning: namespace 'terra' is not available and has been replaced
## by .GlobalEnv when processing object '<unknown>'

## Warning: namespace 'terra' is not available and has been replaced
## by .GlobalEnv when processing object '<unknown>'

library(spData)      
library(spdep)

## Warning: package 'spdep' was built under R version 4.0.5

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.0.5

## Warning: package 'tibble' was built under R version 4.0.5

## Warning: package 'tidyr' was built under R version 4.0.5

## Warning: package 'readr' was built under R version 4.0.5

## Warning: package 'purrr' was built under R version 4.0.5

## Warning: package 'dplyr' was built under R version 4.0.5

## Warning: package 'stringr' was built under R version 4.0.5

## Warning: package 'forcats' was built under R version 4.0.5

library(totalcensus)

## Warning: package 'totalcensus' was built under R version 4.0.5

library(tmap)

## Warning: package 'tmap' was built under R version 4.0.5

library(tmaptools)

## Warning: package 'tmaptools' was built under R version 4.0.5

library(usmap)

## Warning: package 'usmap' was built under R version 4.0.5

library(viridis)

## Warning: package 'viridis' was built under R version 4.0.5

## Warning: package 'viridisLite' was built under R version 4.0.5

library(xgboost)

## Warning: package 'xgboost' was built under R version 4.0.5

#############################################################################
print("Set up >50 R Libraries...")

## [1] "Set up >50 R Libraries..."

Environment Set Up

#############################################################################
Sys.setenv('RETICULATE_PYTHON'='C://Users//tf')
use_python('C://Users')
use_condaenv("tf") 
py$RANDOM_SEED = 1234
matplotlib=import('matplotlib.pyplot')
options(scipen=9); options(digits=3)
#############################################################################
print("Invoked Python ...")

## [1] "Invoked Python ..."

Functions

################################PRINT########################################
myprint=function(x,nm) {
  x%>%kbl(col.names = nm)%>%kable_classic(full_width = F, html_font = "Cambria")}
#############################################################################

##############################COREELATION####################################
corfunction=function(d){
  mycorr=cor(d[, 1:ncol(d)]); p.mat=cor_pmat(d[,1:ncol(d)])
  myplot=ggcorrplot(mycorr, hc.order=TRUE,type="lower",colors=c("red", "white","green"),tl.cex = 8, tl.col = "black", lab=TRUE, lab_size=3, p.mat=p.mat, insig="pch", pch=4)
  print(myplot)}
#############################################################################

#################################SCALE#######################################
myscale=function(x) {(x-min(x))/(max(x)-min(x))}
#############################################################################

print("Set up functions for use...")

## [1] "Set up functions for use..."

Read the Data

#############################################################################
mycsv=read.csv("D:/publications/Heart/thf5.csv", stringsAsFactors = TRUE)
#############################################################################

print("Read the data...")

## [1] "Read the data..."

Delete Rows

#############################################################################
missmap(mycsv)

mycsv$na_count = apply(mycsv, 1, function(x) sum(is.na(x))) #count NA's
mycsv=mycsv[mycsv$na_count<5,] #delete columns with 5 or more NAs (33%)
mycsv$na_count=NULL #remove count variable
#############################################################################

print("Deleting largely empty rows..")

## [1] "Deleting largely empty rows.."

Completeness by Column

#############################################################################
myprint(sort(apply(mycsv,2,function(x) sum(is.na(x))/length(x))), "Missing")

	Missing
Diagnoses	0.000
Equity	0.000
Ownership_	0.000
Medical_School_Affiliation_	0.000
Hospital_Type_	0.000
State_	0.000
Urban_Rural_	0.000
Year_	0.000
DRG_	0.000
Discharges	0.000
Medicare	0.000
Net_Income	0.001
Employees	0.002
Affiliated_Physicians	0.010
Profit_Margin	0.010
Staffed_Beds	0.017
Cash_on_Hand	0.039
ER_Visits	0.067
Medicaid	0.084
Surgeries	0.101

#############################################################################

Train and Test Set

#############################################################################
set.seed(1234) #set a pseudo-random number
mys=sort(sample(1:nrow(mycsv),.2*nrow(mycsv), replace=FALSE))#sample 20%
train=mycsv[-mys,]; test=mycsv[mys,] #test set is that 20%
#############################################################################
print("Make training and test sets for model building...")

## [1] "Make training and test sets for model building..."

Impute the Missing

#############################################################################
for(i in 1:13){ #replace 1% missing with means
  mycsv[is.na(mycsv[,i]), i] <- mean(mycsv[,i], na.rm = TRUE) 
  train[is.na(train[,i]), i] <- mean(train[,i], na.rm = TRUE) 
  test[is.na(test[,i]), i] <- mean(test[,i], na.rm = TRUE)    }
missmap(mycsv)

#############################################################################

Describe the Data

We provide descriptive statistics by year and overall using the total dataset.

#############################################################################
mydescribe=function(x,y){describe(x)%>%kbl(caption=y)%>%kable_classic(full_width = F, html_font = "Cambria")}
mydescribe(mycsv[mycsv$Year_=='Y16',],'Year 2016')

Year 2016
	vars	n	mean	sd	median	trimmed	mad	min	max	range	skew	kurtosis	se
Diagnoses	1	13534	1474.421	2572.021	423.000	869.809	547.079	11.0	39576.000	39565.000	3.598	19.609	22.109
Net_Income	2	13534	17159592.218	118788532.052	1959540.000	6906718.464	7687041.560	-1214338654.0	3309549602.000	4523888256.000	11.516	259.295	1021083.902
Profit_Margin	3	13534	-0.031	1.253	-0.018	-0.020	0.128	-15.5	62.071	77.525	37.882	1792.513	0.011
Cash_on_Hand	4	13534	20182280.996	119241829.018	1913427.000	5666635.050	2868853.239	-2512769971.0	3880477236.000	6393247207.000	10.509	373.462	1024980.358
Equity	5	13534	173861922.263	628434725.437	33863550.000	75970084.376	51538597.100	-3248783235.0	10242977000.000	13491760235.000	11.384	166.724	5401906.823
Discharges	6	13534	6963.759	9883.288	2743.000	4922.636	3664.987	1.0	129339.000	129338.000	3.039	16.466	84.955
ER_Visits	7	13534	32723.756	33875.156	24969.000	27184.606	24502.930	0.0	543457.000	543457.000	3.027	21.221	291.184
Surgeries	8	13534	6323.656	7934.277	4464.500	4847.949	4333.640	0.0	130741.000	130741.000	4.538	40.539	68.202
Staffed_Beds	9	13534	146.009	171.736	87.000	112.371	91.921	2.0	2753.000	2751.000	3.046	19.535	1.476
Affiliated_Physicians	10	13534	230.402	351.820	104.000	155.111	130.469	1.0	4328.000	4327.000	3.859	23.763	3.024
Employees	11	13534	998.841	1669.983	431.000	644.058	475.915	13.0	26491.000	26478.000	5.193	44.359	14.355
Medicare	12	13534	0.449	0.187	0.423	0.440	0.182	0.0	0.983	0.983	0.404	-0.296	0.002
Medicaid	13	13534	0.087	0.091	0.063	0.071	0.056	0.0	0.869	0.869	2.610	10.601	0.001
Ownership_*	14	13534	2.377	0.793	3.000	2.471	0.000	1.0	3.000	2.000	-0.776	-0.980	0.007
Medical_School_Affiliation_*	15	13534	3.549	0.840	4.000	3.715	0.000	1.0	5.000	4.000	-1.553	1.158	0.007
Hospital_Type_*	16	13534	5.583	2.164	7.000	5.855	0.000	1.0	7.000	6.000	-0.975	-0.961	0.019
State_*	17	13534	25.884	14.622	25.000	25.941	19.274	1.0	51.000	50.000	-0.014	-1.304	0.126
Urban_Rural_*	18	13534	1.543	0.498	2.000	1.554	0.000	1.0	2.000	1.000	-0.172	-1.970	0.004
Year_*	19	13534	1.000	0.000	1.000	1.000	0.000	1.0	1.000	0.000	NaN	NaN	0.000
DRG_*	20	13534	1.982	0.809	2.000	1.977	1.483	1.0	3.000	2.000	0.033	-1.471	0.007

mydescribe(mycsv[mycsv$Year_=='Y17',], 'Year 2017')

Year 2017
	vars	n	mean	sd	median	trimmed	mad	min	max	range	skew	kurtosis	se
Diagnoses	1	13434	1685.886	3533.270	376.000	816.951	475.915	11.000	53822.000	53811.000	4.377	27.642	30.484
Net_Income	2	13434	17454389.672	117717320.828	2055648.000	7080657.720	7815103.359	-1214338654.000	3309549602.000	4523888256.000	11.679	266.875	1015635.090
Profit_Margin	3	13434	-0.026	1.247	-0.017	-0.019	0.126	-15.454	62.071	77.525	38.881	1839.785	0.011
Cash_on_Hand	4	13434	20255460.279	120964160.745	1952651.000	5726982.511	2928361.838	-2512769971.000	3880477236.000	6393247207.000	10.090	359.031	1043647.999
Equity	5	13434	174351760.311	619985872.380	34675003.500	76916073.119	52488620.493	-3248783235.000	10242977000.000	13491760235.000	11.328	167.239	5349080.347
Discharges	6	13434	7012.265	9904.395	2792.500	4972.690	3710.206	4.000	129339.000	129335.000	3.029	16.381	85.453
ER_Visits	7	13434	32857.654	34004.995	24969.000	27300.944	24508.861	0.000	543457.000	543457.000	3.022	21.081	293.386
Surgeries	8	13434	6351.913	8014.677	4449.000	4857.215	4346.983	0.000	130741.000	130741.000	4.530	39.954	69.149
Staffed_Beds	9	13434	146.631	172.356	87.000	112.873	91.921	3.000	2753.000	2750.000	3.030	19.338	1.487
Affiliated_Physicians	10	13434	231.616	353.665	104.000	155.710	130.469	1.000	4328.000	4327.000	3.835	23.417	3.051
Employees	11	13434	1008.742	1683.827	436.000	651.263	481.845	4.000	26491.000	26487.000	5.198	44.263	14.528
Medicare	12	13434	0.448	0.186	0.422	0.439	0.181	0.002	0.983	0.981	0.411	-0.279	0.002
Medicaid	13	13434	0.087	0.091	0.063	0.071	0.056	0.000	0.869	0.869	2.613	10.687	0.001
Ownership_*	14	13434	2.378	0.793	3.000	2.473	0.000	1.000	3.000	2.000	-0.781	-0.975	0.007
Medical_School_Affiliation_*	15	13434	3.544	0.843	4.000	3.710	0.000	1.000	5.000	4.000	-1.536	1.101	0.007
Hospital_Type_*	16	13434	5.588	2.164	7.000	5.861	0.000	1.000	7.000	6.000	-0.981	-0.952	0.019
State_*	17	13434	25.868	14.609	25.000	25.916	19.274	1.000	51.000	50.000	-0.010	-1.302	0.126
Urban_Rural_*	18	13434	1.544	0.498	2.000	1.555	0.000	1.000	2.000	1.000	-0.178	-1.969	0.004
Year_*	19	13434	2.000	0.000	2.000	2.000	0.000	2.000	2.000	0.000	NaN	NaN	0.000
DRG_*	20	13434	1.971	0.809	2.000	1.964	1.483	1.000	3.000	2.000	0.053	-1.470	0.007

mydescribe(mycsv[mycsv$Year_=='Y18',], 'Year 2018')

Year 2018
	vars	n	mean	sd	median	trimmed	mad	min	max	range	skew	kurtosis	se
Diagnoses	1	13289	1763.025	3780.335	358.000	825.385	453.676	11.000	57461.000	57450.000	4.400	27.846	32.793
Net_Income	2	13289	17249021.440	117586625.569	2071404.000	7115267.682	7737194.212	-1214338654.000	3309549602.000	4523888256.000	11.705	270.436	1020027.256
Profit_Margin	3	13289	-0.023	1.248	-0.016	-0.018	0.126	-15.454	62.071	77.525	39.344	1854.591	0.011
Cash_on_Hand	4	13289	20469711.526	121724272.987	1999330.000	5809937.758	2991769.675	-2512769971.000	3880477236.000	6393247207.000	10.017	353.916	1055920.055
Equity	5	13289	177575765.876	634571647.806	35038130.000	78143024.189	52950626.081	-3248783235.000	10242977000.000	13491760235.000	11.255	163.063	5504710.877
Discharges	6	13289	7067.705	9937.316	2867.000	5028.271	3822.143	2.000	129339.000	129337.000	3.016	16.264	86.203
ER_Visits	7	13289	33014.752	34051.638	25326.000	27456.406	24815.759	0.000	543457.000	543457.000	3.019	21.065	295.387
Surgeries	8	13289	6372.826	8013.795	4480.000	4880.009	4361.809	0.000	130741.000	130741.000	4.517	39.923	69.517
Staffed_Beds	9	13289	146.889	173.335	86.000	112.901	90.439	3.000	2753.000	2750.000	3.026	19.191	1.504
Affiliated_Physicians	10	13289	233.367	354.940	104.000	157.278	130.469	1.000	4328.000	4327.000	3.819	23.236	3.079
Employees	11	13289	1016.679	1698.380	440.000	656.473	486.293	12.000	26491.000	26479.000	5.181	43.660	14.733
Medicare	12	13289	0.448	0.186	0.421	0.439	0.181	0.002	0.983	0.981	0.426	-0.282	0.002
Medicaid	13	13289	0.087	0.090	0.062	0.070	0.055	0.000	0.869	0.869	2.570	10.324	0.001
Ownership_*	14	13289	2.379	0.795	3.000	2.474	0.000	1.000	3.000	2.000	-0.784	-0.977	0.007
Medical_School_Affiliation_*	15	13289	3.538	0.844	4.000	3.704	0.000	1.000	5.000	4.000	-1.528	1.054	0.007
Hospital_Type_*	16	13289	5.599	2.162	7.000	5.875	0.000	1.000	7.000	6.000	-0.995	-0.929	0.019
State_*	17	13289	25.763	14.606	25.000	25.789	19.274	1.000	51.000	50.000	0.000	-1.301	0.127
Urban_Rural_*	18	13289	1.544	0.498	2.000	1.555	0.000	1.000	2.000	1.000	-0.176	-1.969	0.004
Year_*	19	13289	3.000	0.000	3.000	3.000	0.000	3.000	3.000	0.000	NaN	NaN	0.000
DRG_*	20	13289	1.962	0.809	2.000	1.952	1.483	1.000	3.000	2.000	0.070	-1.468	0.007

mydescribe(mycsv, 'All Years')

All Years
	vars	n	mean	sd	median	trimmed	mad	min	max	range	skew	kurtosis	se
Diagnoses	1	40257	1640.258	3334.942	385.000	835.737	492.223	11.0	57461.000	57450.000	4.440	29.687	16.621
Net_Income	2	40257	17287488.831	118032673.517	2027491.000	7032593.371	7729250.441	-1214338654.0	3309549602.000	4523888256.000	11.633	265.467	588276.555
Profit_Margin	3	40257	-0.027	1.249	-0.017	-0.019	0.126	-15.5	62.071	77.525	38.697	1828.754	0.006
Cash_on_Hand	4	40257	20301583.348	120637587.895	1949345.000	5733344.451	2927095.697	-2512769971.0	3880477236.000	6393247207.000	10.203	362.054	601259.486
Equity	5	40257	175251339.063	627656053.449	34648343.000	76993362.623	52488269.858	-3248783235.0	10242977000.000	13491760235.000	11.324	165.703	3128246.865
Discharges	6	40257	7014.259	9908.036	2811.000	4973.915	3742.082	1.0	129339.000	129338.000	3.028	16.372	49.382
ER_Visits	7	40257	32864.497	33976.188	25085.000	27312.390	24528.134	0.0	543457.000	543457.000	3.023	21.125	169.338
Surgeries	8	40257	6349.317	7987.273	4464.000	4861.438	4345.501	0.0	130741.000	130741.000	4.529	40.143	39.809
Staffed_Beds	9	40257	146.507	172.468	86.000	112.708	90.439	2.0	2753.000	2751.000	3.034	19.357	0.860
Affiliated_Physicians	10	40257	231.786	353.461	104.000	156.015	130.469	1.0	4328.000	4327.000	3.838	23.474	1.762
Employees	11	40257	1008.034	1683.991	436.000	650.519	481.845	4.0	26491.000	26487.000	5.191	44.102	8.393
Medicare	12	40257	0.448	0.186	0.422	0.439	0.181	0.0	0.983	0.983	0.414	-0.285	0.001
Medicaid	13	40257	0.087	0.091	0.063	0.071	0.056	0.0	0.869	0.869	2.598	10.541	0.000
Ownership_*	14	40257	2.378	0.793	3.000	2.472	0.000	1.0	3.000	2.000	-0.780	-0.977	0.004
Medical_School_Affiliation_*	15	40257	3.544	0.842	4.000	3.710	0.000	1.0	5.000	4.000	-1.539	1.105	0.004
Hospital_Type_*	16	40257	5.590	2.163	7.000	5.863	0.000	1.0	7.000	6.000	-0.984	-0.947	0.011
State_*	17	40257	25.839	14.612	25.000	25.882	19.274	1.0	51.000	50.000	-0.008	-1.302	0.073
Urban_Rural_*	18	40257	1.544	0.498	2.000	1.555	0.000	1.0	2.000	1.000	-0.175	-1.969	0.002
Year_*	19	40257	1.994	0.816	2.000	1.992	1.483	1.0	3.000	2.000	0.011	-1.499	0.004
DRG_*	20	40257	1.971	0.809	2.000	1.964	1.483	1.0	3.000	2.000	0.052	-1.470	0.004

#############################################################################

Graphs

Boxplots for Affilation Data

m=10000
myp1=ggplot(mycsv[mycsv$Year_=="Y16",], aes(x=reorder(Medical_School_Affiliation_,Diagnoses, median), y=Diagnoses, color=Medical_School_Affiliation_))+
  geom_boxplot(notch=TRUE)+
  theme(legend.position = "none")+
  xlab("")+
  coord_flip()+
  scale_y_continuous(name="Year 2016", labels = comma, limits=c(0,m))

myp2=ggplot(mycsv[mycsv$Year_=="Y17",], aes(x=reorder(Medical_School_Affiliation_,Diagnoses, median), y=Diagnoses, color=Medical_School_Affiliation_))+
  geom_boxplot(notch=TRUE)+
  theme(legend.position = "none")+
  xlab("")+
  coord_flip()+
  scale_y_continuous(name="Year 2017", labels = comma, limits=c(0,m))

myp3=ggplot(mycsv[mycsv$Year_=="Y18",], aes(x=reorder(Medical_School_Affiliation_,Diagnoses, median), y=Diagnoses, color=Medical_School_Affiliation_))+
  geom_boxplot(notch=TRUE)+
  theme(legend.position = "none")+
  xlab("")+
  coord_flip()+
  scale_y_continuous(name="Year 2018", labels = comma, limits=c(0,m))

grid.arrange(myp1,myp2,myp3)

## Warning: Removed 259 rows containing non-finite values (stat_boxplot).

## Warning: Removed 518 rows containing non-finite values (stat_boxplot).

## notch went outside hinges. Try setting notch=FALSE.

## Warning: Removed 566 rows containing non-finite values (stat_boxplot).

## notch went outside hinges. Try setting notch=FALSE.

Boxplots for Type Data

m=10000
myp1=ggplot(mycsv[mycsv$Year_=="Y16",], aes(x=reorder(Ownership_,Diagnoses, median), y=Diagnoses, color=Ownership_))+
  geom_boxplot(notch=TRUE)+
  theme(legend.position = "none")+
  xlab("")+
  coord_flip()+
  scale_y_continuous(name="Year 2016", labels = comma, limits=c(0,m))

myp2=ggplot(mycsv[mycsv$Year_=="Y17",], aes(x=reorder(Ownership_,Diagnoses, median), y=Diagnoses, color=Ownership_))+
  geom_boxplot(notch=TRUE)+
  theme(legend.position = "none")+
  xlab("")+
  coord_flip()+
  scale_y_continuous(name="Year 2017", labels = comma, limits=c(0,m))

myp3=ggplot(mycsv[mycsv$Year_=="Y18",], aes(x=reorder(Ownership_,Diagnoses, median), y=Diagnoses, color=Ownership_))+
  geom_boxplot(notch=TRUE)+
  theme(legend.position = "none")+
  xlab("")+
  coord_flip()+
  scale_y_continuous(name="Year 2018", labels = comma, limits=c(0,m))

grid.arrange(myp1,myp2,myp3)

## Warning: Removed 259 rows containing non-finite values (stat_boxplot).

## Warning: Removed 518 rows containing non-finite values (stat_boxplot).

## Warning: Removed 566 rows containing non-finite values (stat_boxplot).

Barplots

#############################################################################
mynames=names(mycsv) #get the names for the data
for (i in 14:20){
  p = ggplot(data = mycsv, aes(x = mycsv[,i], fill=Year_))+   geom_bar(stat='count')+facet_grid(~Year_, scale='free_x')+ ggtitle(mynames[i])+ xlab('')+ylab('Count')+ coord_flip() + theme(axis.text.x = element_text(angle = 90, hjust = 1))
  print(p)
  barplot(sort(table(mycsv[, i]), decreasing = FALSE), main=mynames[i], col="red", horiz=TRUE, las=2, cex.names=.5)
}

#############################################################################

Correlations Pre-Transform

We investigate hierarchically clustered correlation plots

#############################################################################
corfunction(mycsv[, 1:13])

#############################################################################

Generate Principal Components

#############################################################################
totalpc=prcomp(mycsv[, 6:11], center=TRUE, scale=TRUE, retx=TRUE)
trainpc=prcomp(train[, 6:11], center=TRUE, scale=TRUE, retx=TRUE)
checkvar=data.frame('Total PC'=totalpc$sdev^2/sum(totalpc$sdev^2), 'TrainPC'= trainpc$sdev^2/sum(trainpc$sdev^2))
row.names(checkvar)=c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6")
colnames(checkvar)=c("%Var, Total Data", "%Var, Training Dat")
myprint(checkvar,colnames(checkvar))

	%Var, Total Data	%Var, Training Dat
PC1	0.848	0.849
PC2	0.065	0.065
PC3	0.034	0.033
PC4	0.028	0.028
PC5	0.016	0.016
PC6	0.010	0.010

mycsv$PC1=totalpc$x[,1]; train$PC1=trainpc$x[,1]
test$PC1=predict(trainpc, newdata=test[,6:11])[,1] #Predict Test with Training
mycsv[,6:11]=train[,6:11]=test[,6:11]=NULL
#############################################################################

Re-order & Re-check Correlations

#############################################################################
corfunction(mycsv[, 1:7,15])

mycsv=mycsv[,c(1,15,2:14)] 
train=train[,c(1,15,2:14)]
test=test[,c(1,15,2:14)]
#############################################################################

Scale for Analysis

#############################################################################
for (i in c(1:8)){mycsv[,i]=myscale(mycsv[,i])
train[,i]=myscale(train[,i]) #update training set
test[,i]=myscale(test[,i])} #update test set
#############################################################################
describe(mycsv[, 1:8])%>%kbl()%>%kable_classic(full_width = F, html_font = "Cambria")

	vars	n	mean	sd	median	trimmed	mad	max	range	skew	kurtosis	se
Diagnoses	1	40257	0.028	0.058	0.007	0.014	0.009	1	1	4.440	29.687	0.000
PC1	2	40257	0.941	0.074	0.968	0.956	0.034	1	1	-3.269	19.291	0.000
Net_Income	3	40257	0.272	0.026	0.269	0.270	0.002	1	1	11.633	265.467	0.000
Profit_Margin	4	40257	0.199	0.016	0.199	0.199	0.002	1	1	38.697	1828.754	0.000
Cash_on_Hand	5	40257	0.396	0.019	0.393	0.394	0.000	1	1	10.203	362.054	0.000
Equity	6	40257	0.254	0.047	0.243	0.247	0.004	1	1	11.324	165.703	0.000
Medicare	7	40257	0.456	0.189	0.429	0.447	0.184	1	1	0.414	-0.285	0.001
Medicaid	8	40257	0.100	0.104	0.072	0.081	0.065	1	1	2.598	10.541	0.001

Python Libraries


#############################################################################
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import gc
import sklearn.linear_model   #Need the linear model components
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor as RFR # Random Forest package
from sklearn.ensemble import ExtraTreesRegressor as ETR # Extra Trees package
from sklearn.ensemble import BaggingRegressor as BR 
from sklearn.model_selection import train_test_split as tts
#from sklearn.model_selection import RandomizedSearchCV as RSCV
from xgboost import XGBRegressor as XGB
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet 
from sklearn.linear_model import LinearRegression as OLS
from sklearn.metrics import mean_squared_error, r2_score # evaluation metrics
#############################################################################
print("Loaded 20 or so Python libraries...")

## Loaded 20 or so Python libraries...

Read Data for Python


#############################################################################
mydata, train, test = r.mycsv, r.train,r.test
mydata, train, test=pd.get_dummies(mydata), pd.get_dummies(train), pd.get_dummies(test)
mynames=mydata.columns
xtrain, ytrain, xtest, ytest =train.iloc[:, 1:len(train.columns)], train.iloc[:,0], test.iloc[:, 1:len(test.columns)], test.iloc[:,0]
xtot, ytot=mydata.iloc[:, 1:len(mydata.columns)], mydata.iloc[:,]
#############################################################################
print("Read in R full, train, test data...")

## Read in R full, train, test data...

Initial ML Models


t1=OLS(n_jobs=-1)
t2=RFR(n_estimators = 50, criterion='mse',n_jobs = -1, random_state = 1234)
t3=ETR(n_estimators = 50,criterion='mse', n_jobs = -1, random_state = 1234)
t4=XGB(n_estimators=50, n_jobs=-1,learning_rate=0.1, max_depth=15,objective='reg:squarederror', random_state=1234) 
t5=BR(n_estimators=50, random_state=1234)

def myf(x,name):
    temp=x.fit(xtrain,ytrain)
    print(name,": ", round(temp.score(xtest, ytest),4))
    
myf(t1, 'OLS')

## OLS :  0.4372

myf(t2, 'RFR')

## RFR :  0.829

myf(t3, 'ETR')

## ETR :  0.8658

myf(t4, 'XGB')

## XGB :  0.8155
## 
## C:\Users\tf\lib\site-packages\xgboost\data.py:114: UserWarning: Use subset (sliced data) of np.ndarray is not recommended because it will generate extra copies and increase memory consumption
##   "because it will generate extra copies and increase " +

myf(t5, 'BR')

#############################################################################

## BR :  0.8285

Prediction with Best Model Post-Tuning


#############################################################################
model=ETR(n_estimators = 200, max_depth=70,n_jobs=-1, random_state=1234)  
modelfit=model.fit(xtrain, ytrain)
mypred=model.predict(xtest)
mynames=mydata.columns
print(modelfit.score(xtest,ytest))

## 0.8644536708445771

tempdata={'Variable':mynames[1:len(mynames)],'Importance':modelfit.feature_importances_}
#############################################################################

Descending in Order for Importance


#############################################################################
a=pd.DataFrame(modelfit.feature_importances_)
a.index, a.columns=mynames[1:len(mynames)],["Importance"]
a.sort_values(by=['Importance'], ascending=False,inplace=True)
#############################################################################
print("Sort variables in order of importance...")

## Sort variables in order of importance...

Barplot of Importances

#############################################################################
imp=reticulate::py$a$Importance
impnames=factor(c("STAC", "PC", "DRG293", "UT", "LTAC"), levels=c("STAC", "PC", "DRG293", "UT", "LTAC"))
tempdf=data.frame(Name=impnames[1:5], Importance=imp[1:5])

ggplot(tempdf, aes(x=Name, y=Importance, fill=Name)) +   geom_bar(stat = "identity") + theme(legend.position = "none",axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),axis.title.x = element_blank())+ geom_text(aes(label=round(Importance,3)), position=position_dodge(width=0.9), vjust=-0.25)

#############################################################################

Regression Models, Hospital Unit of Analysis

Next, we build a train regression and evaluate on test set.

#############################################################################
fullOLS=lm(Diagnoses~., data=train)
mylm=summary(fullOLS); myres=residuals(fullOLS); mypred=predict(fullOLS, test)
testfit=lm(test$Diagnoses~mypred); mysum=summary(testfit)
ans=c(mylm$r.squared, mysum$r.squared);names(ans)=c("Train", "Test")
ans%>%kbl(col.names="R2")%>%kable_classic(full_width = F, html_font = "Cambria")

	R2
Train	0.501
Test	0.454

#############################################################################

Regression Residual Plot

#############################################################################
layout(mat = matrix(c(1,2),2,1, byrow=TRUE),  height = c(1,8))
par(mar=c(0, 3.1, 1.1, 2.1))
boxplot(myres, horizontal=TRUE, main="", xaxt="n", col="red")
par(mar=c(4, 3.1, 1.1, 2.1))
hist(myres, main="", xlab="", col="blue")

#############################################################################

Complete Regression Model on Entire Dataset

#############################################################################
mylmfinal=lm(Diagnoses~., data=mycsv) #build full regression
mysummary=summary(mylmfinal)
ans=c(mylm$r.squared, mysum$r.squared, mysummary$r.squared); names(ans)=c("Train", "Test", "Full")
ans%>%kbl(col.names="R2")%>%kable_classic(full_width = F, html_font = "Cambria")

	R2
Train	0.501
Test	0.454
Full	0.492

summary(aov(mylmfinal))

##                                Df Sum Sq Mean Sq  F value  Pr(>F)    
## PC1                             1   41.0    41.0 23898.59 < 2e-16 ***
## Net_Income                      1    0.1     0.1    41.00 1.5e-10 ***
## Profit_Margin                   1    0.0     0.0     5.65   0.017 *  
## Cash_on_Hand                    1    0.1     0.1    57.11 4.2e-14 ***
## Equity                          1    0.0     0.0    18.86 1.4e-05 ***
## Medicare                        1    0.1     0.1    47.38 5.9e-12 ***
## Medicaid                        1    0.1     0.1    73.65 < 2e-16 ***
## Ownership_                      2    0.4     0.2   120.24 < 2e-16 ***
## Medical_School_Affiliation_     4    0.4     0.1    52.12 < 2e-16 ***
## Hospital_Type_                  5    1.5     0.3   173.42 < 2e-16 ***
## State_                         50    0.9     0.0    10.75 < 2e-16 ***
## Urban_Rural_                    1    0.1     0.1    41.45 1.2e-10 ***
## Year_                           2    0.2     0.1    45.97 < 2e-16 ***
## DRG_                            2   22.0    11.0  6423.70 < 2e-16 ***
## Residuals                   40183   68.9     0.0                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#############################################################################

Function for R2

#############################################################################
myr2=function(pred, actual){
  rss =sum((pred - actual) ^ 2); tss = sum((actual - mean(actual)) ^ 2)
  rsq = 1 - rss/tss; names(rsq)="R2"; return(rsq) }
#############################################################################

Evaluate LASSO

#############################################################################
lambda_seq=10^seq(2, -2, by = -.1)

#Train Data
x=model.matrix(Diagnoses~. , train)[,-1]; y=train$Diagnoses
#Test Data
x2=model.matrix(Diagnoses~., test)[,-1]; y2=test$Diagnoses
#Total Data
x3=model.matrix(Diagnoses~., mycsv)[,-1]; y3=mycsv$Diagnoses
mynames=colnames(x3)

#Best Lambda
best_lam=cv.glmnet(x,y, alpha = 1, lambda = lambda_seq)$lambda.min
names(best_lam)="Best Lambda"

#Model and Predictions
lasso=glmnet(x,y,alpha = 1, lambda = best_lam)
pred1=predict(lasso, s = best_lam, newx=x)
pred2=predict(lasso, s = best_lam, newx=x2)
pred3=predict(lasso, s=best_lam, newx=x3)
t1=myr2(pred1,y); t2=myr2(pred2,y2); t3=myr2(pred3,y3);t4=best_lam
ans=c(t1,t2,t3,t4);names(ans)=c("Train", "Test", "Full", "Best Lambda")
ans%>%kbl(col.names="R2")%>%kable_classic(full_width = F, html_font = "Cambria")

	R2
Train	0.328
Test	0.263
Full	0.323
Best Lambda	0.010

lasso=glmnet(x3,y3, alpha=1, lambda=.01)
#############################################################################

Evaluate Elasticnet

#############################################################################
myenet=cv.glmnet(x,y,alpha = .5, lambda = lambda_seq, relaxed=TRUE)
best_lam=myenet$lambda.min
names(best_lam)="Best Lambda"
enet=glmnet(x,y, alpha = .5, lambda = best_lam, gamma=best_gam)
pred3 <- predict(enet, s = best_lam, newx = x)
pred4 <- predict(enet, s = best_lam, newx = x2)
pred5 <- predict(enet, s = best_lam, newx = x3)
t1=myr2(pred3,y); t2=myr2(pred4,y2); t3=myr2(pred5,y3); t4=best_lam
ans=c(t1,t2,t3,t4);names(ans)=c("Train", "Test", "Full", "Best Lambda")
ans%>%kbl(col.names="R2")%>%kable_classic(full_width = F, html_font = "Cambria")

	R2
Train	0.417
Test	0.348
Full	0.412
Best Lambda	0.010

enet=glmnet(x3,y3, alpha = .5, lambda = best_lam)
#############################################################################

Combine Coefficient Estimates

#############################################################################
LM=mysummary$coefficients[,1]; LM=c(LM[2:15],0, LM[16:74])
PValueLM=mysummary$coefficients[,4]
LASSO=as.vector(lasso$beta); ENET=as.vector(enet$beta)
TOT=cbind(round(LM,3), round(LASSO,3), round(ENET, 3), round(PValueLM, 3))
mydf1=as.data.frame(TOT)
rownames(mydf1)=mynames
colnames(mydf1)=c("OLS for p<.05","LASSO", "ENET", "P-Value OLS")
mydf1%>% kbl()%>%kable_classic(full_width = F, html_font = "Cambria")

	OLS for p<.05	LASSO	ENET	P-Value OLS
PC1	-0.439	-0.298	-0.323	0.000
Net_Income	-0.043	0.000	0.000	0.000
Profit_Margin	0.026	0.000	0.000	0.000
Cash_on_Hand	-0.082	0.000	0.000	0.048
Equity	0.012	0.000	0.000	0.000
Medicare	0.029	0.000	0.000	0.025
Medicaid	-0.002	0.000	0.000	0.000
Ownership_Proprietary	0.003	0.000	0.000	0.419
Ownership_Voluntary Nonprofit	0.006	0.000	0.000	0.000
Medical_School_Affiliation_Limited	0.002	0.000	0.000	0.000
Medical_School_Affiliation_Major	-0.009	0.000	0.000	0.066
Medical_School_Affiliation_No Affiliation	0.000	0.000	0.000	0.000
Medical_School_Affiliation_Unknown	-0.005	0.000	0.000	0.708
Hospital_Type_Critical Access Hospital	0.074	0.000	0.000	0.124
Hospital_Type_Department of Defense Hospital	0.000	0.000	0.000	0.000
Hospital_Type_Long Term Acute Care Hospital	0.048	0.000	0.000	0.000
Hospital_Type_Psychiatric Hospital	0.067	0.000	0.000	0.000
Hospital_Type_Rehabilitation Hospital	0.058	0.000	0.000	0.000
Hospital_Type_Short Term Acute Care Hospital	0.084	0.000	0.006	0.000
State_AL	0.005	0.000	0.000	0.178
State_AR	0.003	0.000	0.000	0.479
State_AZ	-0.002	0.000	0.000	0.659
State_CA	0.000	0.000	0.000	0.985
State_CO	-0.003	0.000	0.000	0.457
State_CT	0.013	0.000	0.000	0.002
State_DC	0.006	0.000	0.000	0.273
State_DE	0.018	0.000	0.000	0.004
State_FL	0.006	0.000	0.000	0.081
State_GA	0.009	0.000	0.000	0.014
State_HI	-0.003	0.000	0.000	0.531
State_IA	0.001	0.000	0.000	0.734
State_ID	0.000	0.000	0.000	0.983
State_IL	0.009	0.000	0.000	0.011
State_IN	0.006	0.000	0.000	0.081
State_KS	0.002	0.000	0.000	0.499
State_KY	0.005	0.000	0.000	0.153
State_LA	0.006	0.000	0.000	0.097
State_MA	0.010	0.000	0.000	0.007
State_MD	0.013	0.000	0.000	0.001
State_ME	-0.002	0.000	0.000	0.605
State_MI	0.013	0.000	0.000	0.000
State_MN	0.003	0.000	0.000	0.432
State_MO	0.005	0.000	0.000	0.131
State_MS	0.003	0.000	0.000	0.466
State_MT	0.004	0.000	0.000	0.369
State_NC	0.016	0.000	0.000	0.000
State_ND	0.004	0.000	0.000	0.390
State_NE	0.001	0.000	0.000	0.859
State_NH	0.002	0.000	0.000	0.643
State_NJ	0.012	0.000	0.000	0.001
State_NM	-0.002	0.000	0.000	0.634
State_NV	0.007	0.000	0.000	0.105
State_NY	-0.005	0.000	0.000	0.136
State_OH	0.007	0.000	0.000	0.038
State_OK	0.001	0.000	0.000	0.808
State_OR	-0.002	0.000	0.000	0.691
State_PA	0.002	0.000	0.000	0.588
State_RI	0.002	0.000	0.000	0.741
State_SC	0.006	0.000	0.000	0.117
State_SD	-0.002	0.000	0.000	0.632
State_TN	0.003	0.000	0.000	0.442
State_TX	0.004	0.000	0.000	0.231
State_UT	-0.004	0.000	0.000	0.266
State_VA	0.014	0.000	0.000	0.000
State_VT	-0.001	0.000	0.000	0.891
State_WA	0.005	0.000	0.000	0.153
State_WI	0.003	0.000	0.000	0.472
State_WV	0.002	0.000	0.000	0.579
State_WY	0.002	0.000	0.000	0.692
Urban_Rural_Urban	0.004	0.000	0.000	0.000
Year_Y17	0.003	0.000	0.000	0.000
Year_Y18	0.004	0.000	0.000	0.000
DRG_DRG292	-0.040	0.000	-0.016	0.000
DRG_DRG293	-0.056	-0.013	-0.029	0.000

#############################################################################

Heart Failure Analysis, Part 1

Doc Larry

October 8, 2020