Load Libraries
require(caret)
## Loading required package: caret
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
require(randomForest)
## Loading required package: randomForest
## Warning: package 'randomForest' was built under R version 4.0.5
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
require(kableExtra)
## Loading required package: kableExtra
## Warning: package 'kableExtra' was built under R version 4.0.5
require(magrittr)
## Loading required package: magrittr
## Warning: package 'magrittr' was built under R version 4.0.5
Read Data
mydata=read.csv("D:/bankrupt.csv", stringsAsFactors = T) #also on my desktop
library(Amelia)
## Warning: package 'Amelia' was built under R version 4.0.5
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 4.0.5
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.0, built: 2021-05-26)
## ## Copyright (C) 2005-2022 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(mydata)

sz=nrow(mydata)
tmp=rep(0,ncol(mydata))
tmp2=rep(0,ncol(mydata))
for (i in 1:ncol(mydata)){
tmp[i]=sum(is.na(mydata[,i]))/sz}
for (i in 1:nrow(mydata)){
tmp2[i]=sum(is.na(mydata[i,]))/ncol(mydata)
}
sub=which(tmp>.10)
print(colnames(mydata[,sub]))
## [1] "Hospital_Compare" "HCAHPS_Star_Rating"
mydata[,sub]=NULL
sub2=which(tmp2>.10)
print(length(sub2))
## [1] 227
mydata=mydata[-sub2,]
missmap(mydata)

dim(mydata)
## [1] 2995 32
Describe
library(psych)
## Warning: package 'psych' was built under R version 4.0.5
##
## Attaching package: 'psych'
## The following object is masked from 'package:randomForest':
##
## outlier
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
table(mydata$Bankrupt)%>%kbl()%>%kable_classic(html_font='Cambria')
describe(mydata)%>%kbl()%>%kable_classic(html_font='Cambria')
|
|
vars
|
n
|
mean
|
sd
|
median
|
trimmed
|
mad
|
min
|
max
|
range
|
skew
|
kurtosis
|
se
|
|
Bankrupt
|
1
|
2995
|
9.015000e-03
|
9.453430e-02
|
0.00
|
0.000000e+00
|
0.000000e+00
|
0.000000e+00
|
1.000000e+00
|
1.000000e+00
|
10.3839753
|
105.8622905
|
1.727400e-03
|
|
Medical_Center
|
2
|
2995
|
5.876460e-02
|
2.352229e-01
|
0.00
|
0.000000e+00
|
0.000000e+00
|
0.000000e+00
|
1.000000e+00
|
1.000000e+00
|
3.7503841
|
12.0694109
|
4.298100e-03
|
|
For_Profit
|
3
|
2995
|
2.393990e-01
|
4.267879e-01
|
0.00
|
1.743846e-01
|
0.000000e+00
|
0.000000e+00
|
1.000000e+00
|
1.000000e+00
|
1.2208124
|
-0.5097870
|
7.798500e-03
|
|
Government
|
4
|
2995
|
1.375626e-01
|
3.444978e-01
|
0.00
|
4.714230e-02
|
0.000000e+00
|
0.000000e+00
|
1.000000e+00
|
1.000000e+00
|
2.1034465
|
2.4252972
|
6.294900e-03
|
|
Sole_Community_Hospital
|
5
|
2995
|
1.549249e-01
|
3.618935e-01
|
0.00
|
6.883600e-02
|
0.000000e+00
|
0.000000e+00
|
1.000000e+00
|
1.000000e+00
|
1.9064168
|
1.6349714
|
6.612800e-03
|
|
Market_Concentration
|
6
|
2792
|
3.483596e-01
|
3.257385e-01
|
0.24
|
3.072381e-01
|
2.816940e-01
|
2.000000e-02
|
1.000000e+00
|
9.800000e-01
|
0.9130595
|
-0.4759151
|
6.164700e-03
|
|
Case_Mix_Index
|
7
|
2992
|
1.641825e+00
|
3.602208e-01
|
1.60
|
1.612135e+00
|
2.816940e-01
|
7.800000e-01
|
5.260000e+00
|
4.480000e+00
|
1.6589389
|
8.1818123
|
6.585500e-03
|
|
Staffed_Beds
|
8
|
2994
|
1.967184e+02
|
1.876464e+02
|
140.00
|
1.645033e+02
|
1.349166e+02
|
1.000000e+00
|
2.247000e+03
|
2.246000e+03
|
2.4784522
|
11.2903816
|
3.429369e+00
|
|
Urban
|
9
|
2995
|
3.208681e-01
|
4.668882e-01
|
0.00
|
2.761786e-01
|
0.000000e+00
|
0.000000e+00
|
1.000000e+00
|
1.000000e+00
|
0.7670877
|
-1.4120476
|
8.531300e-03
|
|
HCAHPS_Recommend
|
10
|
2973
|
7.055096e-01
|
9.621110e-02
|
0.71
|
7.077470e-01
|
8.895600e-02
|
2.700000e-01
|
1.000000e+00
|
7.300000e-01
|
-0.2775591
|
0.2855953
|
1.764500e-03
|
|
Joint_Commission_Certified*
|
11
|
2995
|
1.231720e+00
|
4.227920e-01
|
1.00
|
1.164372e+00
|
0.000000e+00
|
1.000000e+00
|
3.000000e+00
|
2.000000e+00
|
1.2842904
|
-0.3091751
|
7.725500e-03
|
|
Medicare_Days
|
12
|
2995
|
3.599800e-01
|
1.234220e-01
|
0.36
|
3.570713e-01
|
1.186080e-01
|
0.000000e+00
|
8.700000e-01
|
8.700000e-01
|
0.2919153
|
0.4718679
|
2.255200e-03
|
|
Medicaid_Days
|
13
|
2808
|
9.194090e-02
|
8.651570e-02
|
0.06
|
7.753110e-02
|
5.930400e-02
|
0.000000e+00
|
7.600000e-01
|
7.600000e-01
|
1.8402055
|
4.6648706
|
1.632700e-03
|
|
Other_Pay_Days
|
14
|
2995
|
5.553456e-01
|
1.329797e-01
|
0.57
|
5.604631e-01
|
1.334340e-01
|
4.000000e-02
|
1.000000e+00
|
9.600000e-01
|
-0.3554933
|
0.2526298
|
2.429900e-03
|
|
Serious_Complication_Rate
|
15
|
2977
|
9.906550e-01
|
1.958206e-01
|
0.96
|
9.688250e-01
|
1.334340e-01
|
4.600000e-01
|
4.370000e+00
|
3.910000e+00
|
3.4441294
|
36.6912425
|
3.589000e-03
|
|
Bed_Utilization_Rate
|
16
|
2995
|
4.975593e-01
|
2.000358e-01
|
0.51
|
5.006550e-01
|
2.223900e-01
|
2.000000e-02
|
9.700000e-01
|
9.500000e-01
|
-0.1232727
|
-0.7924340
|
3.655200e-03
|
|
Facility_Age
|
17
|
2756
|
1.334235e+01
|
9.617723e+00
|
11.74
|
1.202536e+01
|
7.079415e+00
|
1.070000e+00
|
7.446000e+01
|
7.339000e+01
|
2.1365631
|
7.3900359
|
1.832030e-01
|
|
Affiliated_Physicians
|
18
|
2995
|
3.417092e+02
|
4.222551e+02
|
200.00
|
2.556191e+02
|
1.927380e+02
|
3.000000e+00
|
4.990000e+03
|
4.987000e+03
|
3.5518308
|
20.0452924
|
7.715720e+00
|
|
Current_Ratio
|
19
|
2995
|
6.475793e+00
|
1.239486e+02
|
1.90
|
2.129161e+00
|
1.482600e+00
|
-1.774000e+02
|
5.102400e+03
|
5.279800e+03
|
38.2792423
|
1487.0324946
|
2.264870e+00
|
|
Quick_Ratio
|
20
|
2995
|
6.254825e+00
|
1.229290e+02
|
1.70
|
1.956237e+00
|
1.334340e+00
|
-1.801000e+02
|
5.064900e+03
|
5.245000e+03
|
38.3097451
|
1488.9807961
|
2.246239e+00
|
|
Total_Liabilities
|
21
|
2995
|
1.698386e+08
|
5.759768e+08
|
50794306.00
|
8.858616e+07
|
6.922136e+07
|
-2.904918e+09
|
7.234705e+09
|
1.013962e+10
|
6.8787080
|
67.8477597
|
1.052462e+07
|
|
Total_Assets
|
22
|
2995
|
4.314464e+08
|
1.194589e+09
|
152991466.00
|
2.283658e+08
|
1.763068e+08
|
-4.229379e+08
|
1.893337e+10
|
1.935631e+10
|
10.1011292
|
127.4417386
|
2.182830e+07
|
|
Cash_on_Hand
|
23
|
2956
|
2.684467e+07
|
1.417530e+08
|
1819389.50
|
7.614012e+06
|
2.888817e+06
|
-2.512770e+09
|
3.880477e+09
|
6.393247e+09
|
8.5511882
|
270.3583245
|
2.607234e+06
|
|
Days_Cash_on_Hand
|
24
|
2956
|
4.707240e+01
|
1.440871e+02
|
9.20
|
2.213107e+01
|
1.378818e+01
|
-6.908000e+02
|
2.318000e+03
|
3.008800e+03
|
7.4745146
|
82.2078598
|
2.650165e+00
|
|
Net_Patient_Revenue
|
25
|
2995
|
3.085030e+08
|
4.419112e+08
|
169543117.00
|
2.185648e+08
|
1.689035e+08
|
-6.128715e+07
|
5.951047e+09
|
6.012334e+09
|
4.6189484
|
33.5538639
|
8.074889e+06
|
|
Operating_Income
|
26
|
2995
|
-1.205930e+06
|
9.312949e+07
|
431230.00
|
2.637141e+06
|
1.860530e+07
|
-1.191563e+09
|
1.218900e+09
|
2.410463e+09
|
-3.2332996
|
60.2933855
|
1.701723e+06
|
|
Accounts_Receivable
|
27
|
2981
|
1.084387e+08
|
2.018664e+08
|
47289611.00
|
6.692297e+07
|
5.088041e+07
|
-7.458905e+07
|
3.711121e+09
|
3.785710e+09
|
6.4448731
|
67.2093898
|
3.697285e+06
|
|
Unreimbursed_Uncompensated_Care
|
28
|
2984
|
2.093134e+07
|
3.908247e+07
|
10445219.50
|
1.373891e+07
|
1.082819e+07
|
-2.191300e+04
|
6.791545e+08
|
6.791764e+08
|
7.8031805
|
91.5304318
|
7.154554e+05
|
|
Debt_Equty_Ratio
|
29
|
2778
|
1.233917e+00
|
2.270931e+01
|
0.20
|
3.118930e-01
|
4.225410e-01
|
-1.792500e+02
|
7.419500e+02
|
9.212000e+02
|
24.3093004
|
744.1925412
|
4.308616e-01
|
|
Labor_Compensation_Ratio
|
30
|
2995
|
4.420935e-01
|
1.686432e-01
|
0.42
|
4.290280e-01
|
1.334340e-01
|
-1.440000e+00
|
3.490000e+00
|
4.930000e+00
|
3.0444778
|
50.6653874
|
3.081600e-03
|
|
Net_Operating_Profit_Margin
|
31
|
2995
|
-2.399670e-02
|
2.997157e-01
|
0.00
|
1.502000e-04
|
1.186080e-01
|
-8.350000e+00
|
4.420000e+00
|
1.277000e+01
|
-9.3160650
|
250.2882616
|
5.476600e-03
|
|
Asset_Turnover_Ratio*
|
32
|
2995
|
1.593908e+03
|
9.125296e+02
|
1589.00
|
1.593069e+03
|
1.169771e+03
|
3.000000e+00
|
3.191000e+03
|
3.188000e+03
|
0.0060707
|
-1.1948796
|
1.667434e+01
|
Convert Data to Numeric
mydata$Asset_Turnover_Ratio=as.numeric(mydata$Asset_Turnover_Ratio)
mydata$Joint_Commission_Certified=as.numeric(mydata$Joint_Commission_Certified)
Impute Missing
for(i in 1:ncol(mydata)){
mydata[is.na(mydata[,i]), i] <- median(mydata[,i], na.rm = TRUE)
}
Split and Scale
#Splitting data in train and test data
set.seed(123)
N=nrow(mydata)
mys=sample(1:N,round(.5*N,0), replace=T)
train=mydata[mys,]
test=mydata[-mys,]
c(nrow(train),nrow(test))
## [1] 1498 1818
library(caret)
transform=preProcess(train, method=c("range"))
train=predict(transform,train)
test=predict(transform,test)
Oversample
newtmp=train[train$Bankrupt==1,]
myrows=nrow(newtmp)
set.seed(2)
mys2=sample(1:myrows, 3000,replace=T)
tmp=newtmp[mys2,]
Undersample
newtmp=train[train$Bankrupt==0,]
myrows=nrow(newtmp)
set.seed(3)
mys2=sample(1:myrows,1000,replace=T)
train=rbind(newtmp[mys2,], tmp)
Log Regression
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.0.5
## Loading required package: Matrix
## Loaded glmnet 4.1-2
myglm=glm(Bankrupt~., family = binomial(link='logit'), data=train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
mypred=round(predict(myglm,test[,-1], type="response"),0)
confusionMatrix(as.factor(mypred), as.factor(test$Bankrupt), positive="1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1624 4
## 1 179 11
##
## Accuracy : 0.8993
## 95% CI : (0.8846, 0.9128)
## No Information Rate : 0.9917
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0935
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.733333
## Specificity : 0.900721
## Pos Pred Value : 0.057895
## Neg Pred Value : 0.997543
## Prevalence : 0.008251
## Detection Rate : 0.006051
## Detection Prevalence : 0.104510
## Balanced Accuracy : 0.817027
##
## 'Positive' Class : 1
##
Fit Random Forest
ctrl=trainControl(method = "LOOCV")
# Fitting Random Forest to the train dataset
set.seed(1) # Setting seed
myRF = randomForest(x = train[-1], y = as.factor(train$Bankrupt),
ntree=100, mtry=3,trControl=ctrl)
myRF
##
## Call:
## randomForest(x = train[-1], y = as.factor(train$Bankrupt), ntree = 100, mtry = 3, trControl = ctrl)
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 0%
## Confusion matrix:
## 0 1 class.error
## 0 1000 0 0
## 1 0 3000 0
RF Confusion Matrix and Metrics for Test Set
y_pred = predict(myRF, newdata = test[-1])
confusionMatrix(y_pred, as.factor(test$Bankrupt), positive="1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1802 14
## 1 1 1
##
## Accuracy : 0.9917
## 95% CI : (0.9864, 0.9954)
## No Information Rate : 0.9917
## P-Value [Acc > NIR] : 0.568090
##
## Kappa : 0.1159
##
## Mcnemar's Test P-Value : 0.001946
##
## Sensitivity : 0.0666667
## Specificity : 0.9994454
## Pos Pred Value : 0.5000000
## Neg Pred Value : 0.9922907
## Prevalence : 0.0082508
## Detection Rate : 0.0005501
## Detection Prevalence : 0.0011001
## Balanced Accuracy : 0.5330560
##
## 'Positive' Class : 1
##
Fit XGBoost
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.0.5
bst<- xgboost(data = as.matrix(train[,-1]), label = train$Bankrupt, max.depth = 3, eta = .01, nthread = 2, nrounds = 5000, objective = "binary:logistic", verbose=0)
## [09:30:50] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
pred <- predict(bst, as.matrix(test[,-1]))
err <- mean(as.numeric(pred > 0.5) != test$Bankrupt)
err
## [1] 0.006050605
confusionMatrix(as.factor(round(pred,0)), as.factor(test$Bankrupt), positive='1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1799 7
## 1 4 8
##
## Accuracy : 0.9939
## 95% CI : (0.9892, 0.997)
## No Information Rate : 0.9917
## P-Value [Acc > NIR] : 0.1837
##
## Kappa : 0.5896
##
## Mcnemar's Test P-Value : 0.5465
##
## Sensitivity : 0.533333
## Specificity : 0.997781
## Pos Pred Value : 0.666667
## Neg Pred Value : 0.996124
## Prevalence : 0.008251
## Detection Rate : 0.004400
## Detection Prevalence : 0.006601
## Balanced Accuracy : 0.765557
##
## 'Positive' Class : 1
##
Produce Classification Plots & Importance Metrics
#xgboost
xgb.plot.tree(model = bst, trees = 1)
importance_matrix =xgb.importance(colnames(train[,-1]), model = bst)
xgb.ggplot.importance(importance_matrix)

# Plotting model
plot(myRF)

# Importance plot
importance(myRF)
## MeanDecreaseGini
## Medical_Center 1.7104464
## For_Profit 30.6289000
## Government 3.6210719
## Sole_Community_Hospital 7.9621004
## Market_Concentration 26.5916447
## Case_Mix_Index 63.1194109
## Staffed_Beds 50.7836766
## Urban 0.5140279
## HCAHPS_Recommend 41.9967584
## Joint_Commission_Certified 3.7460658
## Medicare_Days 39.1492713
## Medicaid_Days 27.1875925
## Other_Pay_Days 17.9879686
## Serious_Complication_Rate 56.7894952
## Bed_Utilization_Rate 65.6773390
## Facility_Age 41.5947709
## Affiliated_Physicians 30.5935444
## Current_Ratio 85.5537365
## Quick_Ratio 91.0658275
## Total_Liabilities 137.2097256
## Total_Assets 109.1168123
## Cash_on_Hand 47.6951714
## Days_Cash_on_Hand 37.6601491
## Net_Patient_Revenue 81.3756940
## Operating_Income 41.5810777
## Accounts_Receivable 39.9215887
## Unreimbursed_Uncompensated_Care 144.5551548
## Debt_Equty_Ratio 64.6525918
## Labor_Compensation_Ratio 25.7900284
## Net_Operating_Profit_Margin 28.4190374
## Asset_Turnover_Ratio 57.9490051
# Variable importance plot
varImpPlot(myRF)

newdata=mydata[mydata$Unreimbursed_Uncompensated_Care<.01,]
boxplot(Unreimbursed_Uncompensated_Care~Bankrupt, data=newdata, horizontal=T, col=c('blue','red'), notch=T)

newdata=mydata[mydata$Total_Assets<.1,]
boxplot(Total_Assets~Bankrupt, data=newdata, horizontal=T, col=c('blue','red'), notch=T)
## Warning in bxp(list(stats = structure(c(-92478391, -63979036.5, -17207671, :
## some notches went outside hinges ('box'): maybe set notch=FALSE
