Load Libraries

require(caret)
## Loading required package: caret
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
require(randomForest)
## Loading required package: randomForest
## Warning: package 'randomForest' was built under R version 4.0.5
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
require(kableExtra)
## Loading required package: kableExtra
## Warning: package 'kableExtra' was built under R version 4.0.5
require(magrittr)
## Loading required package: magrittr
## Warning: package 'magrittr' was built under R version 4.0.5

Read Data

mydata=read.csv("D:/bankrupt.csv", stringsAsFactors = T) #also on my desktop

library(Amelia)
## Warning: package 'Amelia' was built under R version 4.0.5
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 4.0.5
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.0, built: 2021-05-26)
## ## Copyright (C) 2005-2022 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(mydata)

sz=nrow(mydata)
tmp=rep(0,ncol(mydata))
tmp2=rep(0,ncol(mydata))

for (i in 1:ncol(mydata)){
  tmp[i]=sum(is.na(mydata[,i]))/sz}

for (i in 1:nrow(mydata)){
  tmp2[i]=sum(is.na(mydata[i,]))/ncol(mydata)
  }

sub=which(tmp>.10)
print(colnames(mydata[,sub]))
## [1] "Hospital_Compare"   "HCAHPS_Star_Rating"
mydata[,sub]=NULL
sub2=which(tmp2>.10)
print(length(sub2))
## [1] 227
mydata=mydata[-sub2,]
missmap(mydata)

dim(mydata)
## [1] 2995   32

Describe

library(psych)
## Warning: package 'psych' was built under R version 4.0.5
## 
## Attaching package: 'psych'
## The following object is masked from 'package:randomForest':
## 
##     outlier
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
table(mydata$Bankrupt)%>%kbl()%>%kable_classic(html_font='Cambria')
Var1 Freq
0 2968
1 27
describe(mydata)%>%kbl()%>%kable_classic(html_font='Cambria')
vars n mean sd median trimmed mad min max range skew kurtosis se
Bankrupt 1 2995 9.015000e-03 9.453430e-02 0.00 0.000000e+00 0.000000e+00 0.000000e+00 1.000000e+00 1.000000e+00 10.3839753 105.8622905 1.727400e-03
Medical_Center 2 2995 5.876460e-02 2.352229e-01 0.00 0.000000e+00 0.000000e+00 0.000000e+00 1.000000e+00 1.000000e+00 3.7503841 12.0694109 4.298100e-03
For_Profit 3 2995 2.393990e-01 4.267879e-01 0.00 1.743846e-01 0.000000e+00 0.000000e+00 1.000000e+00 1.000000e+00 1.2208124 -0.5097870 7.798500e-03
Government 4 2995 1.375626e-01 3.444978e-01 0.00 4.714230e-02 0.000000e+00 0.000000e+00 1.000000e+00 1.000000e+00 2.1034465 2.4252972 6.294900e-03
Sole_Community_Hospital 5 2995 1.549249e-01 3.618935e-01 0.00 6.883600e-02 0.000000e+00 0.000000e+00 1.000000e+00 1.000000e+00 1.9064168 1.6349714 6.612800e-03
Market_Concentration 6 2792 3.483596e-01 3.257385e-01 0.24 3.072381e-01 2.816940e-01 2.000000e-02 1.000000e+00 9.800000e-01 0.9130595 -0.4759151 6.164700e-03
Case_Mix_Index 7 2992 1.641825e+00 3.602208e-01 1.60 1.612135e+00 2.816940e-01 7.800000e-01 5.260000e+00 4.480000e+00 1.6589389 8.1818123 6.585500e-03
Staffed_Beds 8 2994 1.967184e+02 1.876464e+02 140.00 1.645033e+02 1.349166e+02 1.000000e+00 2.247000e+03 2.246000e+03 2.4784522 11.2903816 3.429369e+00
Urban 9 2995 3.208681e-01 4.668882e-01 0.00 2.761786e-01 0.000000e+00 0.000000e+00 1.000000e+00 1.000000e+00 0.7670877 -1.4120476 8.531300e-03
HCAHPS_Recommend 10 2973 7.055096e-01 9.621110e-02 0.71 7.077470e-01 8.895600e-02 2.700000e-01 1.000000e+00 7.300000e-01 -0.2775591 0.2855953 1.764500e-03
Joint_Commission_Certified* 11 2995 1.231720e+00 4.227920e-01 1.00 1.164372e+00 0.000000e+00 1.000000e+00 3.000000e+00 2.000000e+00 1.2842904 -0.3091751 7.725500e-03
Medicare_Days 12 2995 3.599800e-01 1.234220e-01 0.36 3.570713e-01 1.186080e-01 0.000000e+00 8.700000e-01 8.700000e-01 0.2919153 0.4718679 2.255200e-03
Medicaid_Days 13 2808 9.194090e-02 8.651570e-02 0.06 7.753110e-02 5.930400e-02 0.000000e+00 7.600000e-01 7.600000e-01 1.8402055 4.6648706 1.632700e-03
Other_Pay_Days 14 2995 5.553456e-01 1.329797e-01 0.57 5.604631e-01 1.334340e-01 4.000000e-02 1.000000e+00 9.600000e-01 -0.3554933 0.2526298 2.429900e-03
Serious_Complication_Rate 15 2977 9.906550e-01 1.958206e-01 0.96 9.688250e-01 1.334340e-01 4.600000e-01 4.370000e+00 3.910000e+00 3.4441294 36.6912425 3.589000e-03
Bed_Utilization_Rate 16 2995 4.975593e-01 2.000358e-01 0.51 5.006550e-01 2.223900e-01 2.000000e-02 9.700000e-01 9.500000e-01 -0.1232727 -0.7924340 3.655200e-03
Facility_Age 17 2756 1.334235e+01 9.617723e+00 11.74 1.202536e+01 7.079415e+00 1.070000e+00 7.446000e+01 7.339000e+01 2.1365631 7.3900359 1.832030e-01
Affiliated_Physicians 18 2995 3.417092e+02 4.222551e+02 200.00 2.556191e+02 1.927380e+02 3.000000e+00 4.990000e+03 4.987000e+03 3.5518308 20.0452924 7.715720e+00
Current_Ratio 19 2995 6.475793e+00 1.239486e+02 1.90 2.129161e+00 1.482600e+00 -1.774000e+02 5.102400e+03 5.279800e+03 38.2792423 1487.0324946 2.264870e+00
Quick_Ratio 20 2995 6.254825e+00 1.229290e+02 1.70 1.956237e+00 1.334340e+00 -1.801000e+02 5.064900e+03 5.245000e+03 38.3097451 1488.9807961 2.246239e+00
Total_Liabilities 21 2995 1.698386e+08 5.759768e+08 50794306.00 8.858616e+07 6.922136e+07 -2.904918e+09 7.234705e+09 1.013962e+10 6.8787080 67.8477597 1.052462e+07
Total_Assets 22 2995 4.314464e+08 1.194589e+09 152991466.00 2.283658e+08 1.763068e+08 -4.229379e+08 1.893337e+10 1.935631e+10 10.1011292 127.4417386 2.182830e+07
Cash_on_Hand 23 2956 2.684467e+07 1.417530e+08 1819389.50 7.614012e+06 2.888817e+06 -2.512770e+09 3.880477e+09 6.393247e+09 8.5511882 270.3583245 2.607234e+06
Days_Cash_on_Hand 24 2956 4.707240e+01 1.440871e+02 9.20 2.213107e+01 1.378818e+01 -6.908000e+02 2.318000e+03 3.008800e+03 7.4745146 82.2078598 2.650165e+00
Net_Patient_Revenue 25 2995 3.085030e+08 4.419112e+08 169543117.00 2.185648e+08 1.689035e+08 -6.128715e+07 5.951047e+09 6.012334e+09 4.6189484 33.5538639 8.074889e+06
Operating_Income 26 2995 -1.205930e+06 9.312949e+07 431230.00 2.637141e+06 1.860530e+07 -1.191563e+09 1.218900e+09 2.410463e+09 -3.2332996 60.2933855 1.701723e+06
Accounts_Receivable 27 2981 1.084387e+08 2.018664e+08 47289611.00 6.692297e+07 5.088041e+07 -7.458905e+07 3.711121e+09 3.785710e+09 6.4448731 67.2093898 3.697285e+06
Unreimbursed_Uncompensated_Care 28 2984 2.093134e+07 3.908247e+07 10445219.50 1.373891e+07 1.082819e+07 -2.191300e+04 6.791545e+08 6.791764e+08 7.8031805 91.5304318 7.154554e+05
Debt_Equty_Ratio 29 2778 1.233917e+00 2.270931e+01 0.20 3.118930e-01 4.225410e-01 -1.792500e+02 7.419500e+02 9.212000e+02 24.3093004 744.1925412 4.308616e-01
Labor_Compensation_Ratio 30 2995 4.420935e-01 1.686432e-01 0.42 4.290280e-01 1.334340e-01 -1.440000e+00 3.490000e+00 4.930000e+00 3.0444778 50.6653874 3.081600e-03
Net_Operating_Profit_Margin 31 2995 -2.399670e-02 2.997157e-01 0.00 1.502000e-04 1.186080e-01 -8.350000e+00 4.420000e+00 1.277000e+01 -9.3160650 250.2882616 5.476600e-03
Asset_Turnover_Ratio* 32 2995 1.593908e+03 9.125296e+02 1589.00 1.593069e+03 1.169771e+03 3.000000e+00 3.191000e+03 3.188000e+03 0.0060707 -1.1948796 1.667434e+01

Convert Data to Numeric

mydata$Asset_Turnover_Ratio=as.numeric(mydata$Asset_Turnover_Ratio)
mydata$Joint_Commission_Certified=as.numeric(mydata$Joint_Commission_Certified)

Impute Missing

for(i in 1:ncol(mydata)){
  mydata[is.na(mydata[,i]), i] <- median(mydata[,i], na.rm = TRUE)
}

Split and Scale

#Splitting data in train and test data
set.seed(123)
N=nrow(mydata)
mys=sample(1:N,round(.5*N,0), replace=T)
train=mydata[mys,]
test=mydata[-mys,]
c(nrow(train),nrow(test))
## [1] 1498 1818
library(caret)

transform=preProcess(train, method=c("range"))
train=predict(transform,train)
test=predict(transform,test)

Oversample

newtmp=train[train$Bankrupt==1,]
myrows=nrow(newtmp)
set.seed(2)
mys2=sample(1:myrows, 3000,replace=T)
tmp=newtmp[mys2,]

Undersample

newtmp=train[train$Bankrupt==0,]
myrows=nrow(newtmp)
set.seed(3)
mys2=sample(1:myrows,1000,replace=T)
train=rbind(newtmp[mys2,], tmp)

Log Regression

library(glmnet)
## Warning: package 'glmnet' was built under R version 4.0.5
## Loading required package: Matrix
## Loaded glmnet 4.1-2
myglm=glm(Bankrupt~., family = binomial(link='logit'), data=train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
mypred=round(predict(myglm,test[,-1], type="response"),0)
confusionMatrix(as.factor(mypred), as.factor(test$Bankrupt), positive="1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1624    4
##          1  179   11
##                                           
##                Accuracy : 0.8993          
##                  95% CI : (0.8846, 0.9128)
##     No Information Rate : 0.9917          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0935          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.733333        
##             Specificity : 0.900721        
##          Pos Pred Value : 0.057895        
##          Neg Pred Value : 0.997543        
##              Prevalence : 0.008251        
##          Detection Rate : 0.006051        
##    Detection Prevalence : 0.104510        
##       Balanced Accuracy : 0.817027        
##                                           
##        'Positive' Class : 1               
## 

Fit Random Forest

ctrl=trainControl(method = "LOOCV")
# Fitting Random Forest to the train dataset
set.seed(1)  # Setting seed
myRF = randomForest(x = train[-1], y = as.factor(train$Bankrupt),
             ntree=100, mtry=3,trControl=ctrl)
myRF
## 
## Call:
##  randomForest(x = train[-1], y = as.factor(train$Bankrupt), ntree = 100,      mtry = 3, trControl = ctrl) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 0%
## Confusion matrix:
##      0    1 class.error
## 0 1000    0           0
## 1    0 3000           0

RF Confusion Matrix and Metrics for Test Set

y_pred = predict(myRF, newdata = test[-1])
confusionMatrix(y_pred, as.factor(test$Bankrupt), positive="1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1802   14
##          1    1    1
##                                           
##                Accuracy : 0.9917          
##                  95% CI : (0.9864, 0.9954)
##     No Information Rate : 0.9917          
##     P-Value [Acc > NIR] : 0.568090        
##                                           
##                   Kappa : 0.1159          
##                                           
##  Mcnemar's Test P-Value : 0.001946        
##                                           
##             Sensitivity : 0.0666667       
##             Specificity : 0.9994454       
##          Pos Pred Value : 0.5000000       
##          Neg Pred Value : 0.9922907       
##              Prevalence : 0.0082508       
##          Detection Rate : 0.0005501       
##    Detection Prevalence : 0.0011001       
##       Balanced Accuracy : 0.5330560       
##                                           
##        'Positive' Class : 1               
## 

Fit XGBoost

library(xgboost)
## Warning: package 'xgboost' was built under R version 4.0.5
bst<- xgboost(data = as.matrix(train[,-1]), label = train$Bankrupt, max.depth = 3, eta = .01, nthread = 2, nrounds = 5000, objective = "binary:logistic", verbose=0)
## [09:30:50] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
pred <- predict(bst, as.matrix(test[,-1]))
err <- mean(as.numeric(pred > 0.5) != test$Bankrupt)
err
## [1] 0.006050605
confusionMatrix(as.factor(round(pred,0)), as.factor(test$Bankrupt), positive='1')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1799    7
##          1    4    8
##                                          
##                Accuracy : 0.9939         
##                  95% CI : (0.9892, 0.997)
##     No Information Rate : 0.9917         
##     P-Value [Acc > NIR] : 0.1837         
##                                          
##                   Kappa : 0.5896         
##                                          
##  Mcnemar's Test P-Value : 0.5465         
##                                          
##             Sensitivity : 0.533333       
##             Specificity : 0.997781       
##          Pos Pred Value : 0.666667       
##          Neg Pred Value : 0.996124       
##              Prevalence : 0.008251       
##          Detection Rate : 0.004400       
##    Detection Prevalence : 0.006601       
##       Balanced Accuracy : 0.765557       
##                                          
##        'Positive' Class : 1              
## 

Produce Classification Plots & Importance Metrics

#xgboost

xgb.plot.tree(model = bst, trees = 1)
importance_matrix =xgb.importance(colnames(train[,-1]), model = bst)
xgb.ggplot.importance(importance_matrix)

# Plotting model
plot(myRF)

# Importance plot
importance(myRF)
##                                 MeanDecreaseGini
## Medical_Center                         1.7104464
## For_Profit                            30.6289000
## Government                             3.6210719
## Sole_Community_Hospital                7.9621004
## Market_Concentration                  26.5916447
## Case_Mix_Index                        63.1194109
## Staffed_Beds                          50.7836766
## Urban                                  0.5140279
## HCAHPS_Recommend                      41.9967584
## Joint_Commission_Certified             3.7460658
## Medicare_Days                         39.1492713
## Medicaid_Days                         27.1875925
## Other_Pay_Days                        17.9879686
## Serious_Complication_Rate             56.7894952
## Bed_Utilization_Rate                  65.6773390
## Facility_Age                          41.5947709
## Affiliated_Physicians                 30.5935444
## Current_Ratio                         85.5537365
## Quick_Ratio                           91.0658275
## Total_Liabilities                    137.2097256
## Total_Assets                         109.1168123
## Cash_on_Hand                          47.6951714
## Days_Cash_on_Hand                     37.6601491
## Net_Patient_Revenue                   81.3756940
## Operating_Income                      41.5810777
## Accounts_Receivable                   39.9215887
## Unreimbursed_Uncompensated_Care      144.5551548
## Debt_Equty_Ratio                      64.6525918
## Labor_Compensation_Ratio              25.7900284
## Net_Operating_Profit_Margin           28.4190374
## Asset_Turnover_Ratio                  57.9490051
# Variable importance plot
varImpPlot(myRF)

newdata=mydata[mydata$Unreimbursed_Uncompensated_Care<.01,]
boxplot(Unreimbursed_Uncompensated_Care~Bankrupt, data=newdata, horizontal=T, col=c('blue','red'), notch=T)

newdata=mydata[mydata$Total_Assets<.1,]
boxplot(Total_Assets~Bankrupt, data=newdata, horizontal=T, col=c('blue','red'), notch=T)
## Warning in bxp(list(stats = structure(c(-92478391, -63979036.5, -17207671, :
## some notches went outside hinges ('box'): maybe set notch=FALSE