Load Libraries

require(caret)

## Loading required package: caret

## Warning: package 'caret' was built under R version 4.0.5

## Loading required package: lattice

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 4.0.5

require(randomForest)

## Loading required package: randomForest

## Warning: package 'randomForest' was built under R version 4.0.5

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

require(kableExtra)

## Loading required package: kableExtra

## Warning: package 'kableExtra' was built under R version 4.0.5

require(magrittr)

## Loading required package: magrittr

## Warning: package 'magrittr' was built under R version 4.0.5

Read Data

mydata=read.csv("D:/bankrupt.csv", stringsAsFactors = T) #also on my desktop

library(Amelia)

## Warning: package 'Amelia' was built under R version 4.0.5

## Loading required package: Rcpp

## Warning: package 'Rcpp' was built under R version 4.0.5

## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.0, built: 2021-05-26)
## ## Copyright (C) 2005-2022 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##

missmap(mydata)

sz=nrow(mydata)
tmp=rep(0,ncol(mydata))
tmp2=rep(0,ncol(mydata))

for (i in 1:ncol(mydata)){
  tmp[i]=sum(is.na(mydata[,i]))/sz}

for (i in 1:nrow(mydata)){
  tmp2[i]=sum(is.na(mydata[i,]))/ncol(mydata)
  }

sub=which(tmp>.10)
print(colnames(mydata[,sub]))

## [1] "Hospital_Compare"   "HCAHPS_Star_Rating"

mydata[,sub]=NULL
sub2=which(tmp2>.10)
print(length(sub2))

## [1] 227

mydata=mydata[-sub2,]
missmap(mydata)

dim(mydata)

## [1] 2995   32

Describe

library(psych)

## Warning: package 'psych' was built under R version 4.0.5

## 
## Attaching package: 'psych'

## The following object is masked from 'package:randomForest':
## 
##     outlier

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

table(mydata$Bankrupt)%>%kbl()%>%kable_classic(html_font='Cambria')

Var1	Freq
0	2968
1	27

describe(mydata)%>%kbl()%>%kable_classic(html_font='Cambria')

	vars	n	mean	sd	median	trimmed	mad	min	max	range	skew	kurtosis	se
Bankrupt	1	2995	9.015000e-03	9.453430e-02	0.00	0.000000e+00	0.000000e+00	0.000000e+00	1.000000e+00	1.000000e+00	10.3839753	105.8622905	1.727400e-03
Medical_Center	2	2995	5.876460e-02	2.352229e-01	0.00	0.000000e+00	0.000000e+00	0.000000e+00	1.000000e+00	1.000000e+00	3.7503841	12.0694109	4.298100e-03
For_Profit	3	2995	2.393990e-01	4.267879e-01	0.00	1.743846e-01	0.000000e+00	0.000000e+00	1.000000e+00	1.000000e+00	1.2208124	-0.5097870	7.798500e-03
Government	4	2995	1.375626e-01	3.444978e-01	0.00	4.714230e-02	0.000000e+00	0.000000e+00	1.000000e+00	1.000000e+00	2.1034465	2.4252972	6.294900e-03
Sole_Community_Hospital	5	2995	1.549249e-01	3.618935e-01	0.00	6.883600e-02	0.000000e+00	0.000000e+00	1.000000e+00	1.000000e+00	1.9064168	1.6349714	6.612800e-03
Market_Concentration	6	2792	3.483596e-01	3.257385e-01	0.24	3.072381e-01	2.816940e-01	2.000000e-02	1.000000e+00	9.800000e-01	0.9130595	-0.4759151	6.164700e-03
Case_Mix_Index	7	2992	1.641825e+00	3.602208e-01	1.60	1.612135e+00	2.816940e-01	7.800000e-01	5.260000e+00	4.480000e+00	1.6589389	8.1818123	6.585500e-03
Staffed_Beds	8	2994	1.967184e+02	1.876464e+02	140.00	1.645033e+02	1.349166e+02	1.000000e+00	2.247000e+03	2.246000e+03	2.4784522	11.2903816	3.429369e+00
Urban	9	2995	3.208681e-01	4.668882e-01	0.00	2.761786e-01	0.000000e+00	0.000000e+00	1.000000e+00	1.000000e+00	0.7670877	-1.4120476	8.531300e-03
HCAHPS_Recommend	10	2973	7.055096e-01	9.621110e-02	0.71	7.077470e-01	8.895600e-02	2.700000e-01	1.000000e+00	7.300000e-01	-0.2775591	0.2855953	1.764500e-03
Joint_Commission_Certified*	11	2995	1.231720e+00	4.227920e-01	1.00	1.164372e+00	0.000000e+00	1.000000e+00	3.000000e+00	2.000000e+00	1.2842904	-0.3091751	7.725500e-03
Medicare_Days	12	2995	3.599800e-01	1.234220e-01	0.36	3.570713e-01	1.186080e-01	0.000000e+00	8.700000e-01	8.700000e-01	0.2919153	0.4718679	2.255200e-03
Medicaid_Days	13	2808	9.194090e-02	8.651570e-02	0.06	7.753110e-02	5.930400e-02	0.000000e+00	7.600000e-01	7.600000e-01	1.8402055	4.6648706	1.632700e-03
Other_Pay_Days	14	2995	5.553456e-01	1.329797e-01	0.57	5.604631e-01	1.334340e-01	4.000000e-02	1.000000e+00	9.600000e-01	-0.3554933	0.2526298	2.429900e-03
Serious_Complication_Rate	15	2977	9.906550e-01	1.958206e-01	0.96	9.688250e-01	1.334340e-01	4.600000e-01	4.370000e+00	3.910000e+00	3.4441294	36.6912425	3.589000e-03
Bed_Utilization_Rate	16	2995	4.975593e-01	2.000358e-01	0.51	5.006550e-01	2.223900e-01	2.000000e-02	9.700000e-01	9.500000e-01	-0.1232727	-0.7924340	3.655200e-03
Facility_Age	17	2756	1.334235e+01	9.617723e+00	11.74	1.202536e+01	7.079415e+00	1.070000e+00	7.446000e+01	7.339000e+01	2.1365631	7.3900359	1.832030e-01
Affiliated_Physicians	18	2995	3.417092e+02	4.222551e+02	200.00	2.556191e+02	1.927380e+02	3.000000e+00	4.990000e+03	4.987000e+03	3.5518308	20.0452924	7.715720e+00
Current_Ratio	19	2995	6.475793e+00	1.239486e+02	1.90	2.129161e+00	1.482600e+00	-1.774000e+02	5.102400e+03	5.279800e+03	38.2792423	1487.0324946	2.264870e+00
Quick_Ratio	20	2995	6.254825e+00	1.229290e+02	1.70	1.956237e+00	1.334340e+00	-1.801000e+02	5.064900e+03	5.245000e+03	38.3097451	1488.9807961	2.246239e+00
Total_Liabilities	21	2995	1.698386e+08	5.759768e+08	50794306.00	8.858616e+07	6.922136e+07	-2.904918e+09	7.234705e+09	1.013962e+10	6.8787080	67.8477597	1.052462e+07
Total_Assets	22	2995	4.314464e+08	1.194589e+09	152991466.00	2.283658e+08	1.763068e+08	-4.229379e+08	1.893337e+10	1.935631e+10	10.1011292	127.4417386	2.182830e+07
Cash_on_Hand	23	2956	2.684467e+07	1.417530e+08	1819389.50	7.614012e+06	2.888817e+06	-2.512770e+09	3.880477e+09	6.393247e+09	8.5511882	270.3583245	2.607234e+06
Days_Cash_on_Hand	24	2956	4.707240e+01	1.440871e+02	9.20	2.213107e+01	1.378818e+01	-6.908000e+02	2.318000e+03	3.008800e+03	7.4745146	82.2078598	2.650165e+00
Net_Patient_Revenue	25	2995	3.085030e+08	4.419112e+08	169543117.00	2.185648e+08	1.689035e+08	-6.128715e+07	5.951047e+09	6.012334e+09	4.6189484	33.5538639	8.074889e+06
Operating_Income	26	2995	-1.205930e+06	9.312949e+07	431230.00	2.637141e+06	1.860530e+07	-1.191563e+09	1.218900e+09	2.410463e+09	-3.2332996	60.2933855	1.701723e+06
Accounts_Receivable	27	2981	1.084387e+08	2.018664e+08	47289611.00	6.692297e+07	5.088041e+07	-7.458905e+07	3.711121e+09	3.785710e+09	6.4448731	67.2093898	3.697285e+06
Unreimbursed_Uncompensated_Care	28	2984	2.093134e+07	3.908247e+07	10445219.50	1.373891e+07	1.082819e+07	-2.191300e+04	6.791545e+08	6.791764e+08	7.8031805	91.5304318	7.154554e+05
Debt_Equty_Ratio	29	2778	1.233917e+00	2.270931e+01	0.20	3.118930e-01	4.225410e-01	-1.792500e+02	7.419500e+02	9.212000e+02	24.3093004	744.1925412	4.308616e-01
Labor_Compensation_Ratio	30	2995	4.420935e-01	1.686432e-01	0.42	4.290280e-01	1.334340e-01	-1.440000e+00	3.490000e+00	4.930000e+00	3.0444778	50.6653874	3.081600e-03
Net_Operating_Profit_Margin	31	2995	-2.399670e-02	2.997157e-01	0.00	1.502000e-04	1.186080e-01	-8.350000e+00	4.420000e+00	1.277000e+01	-9.3160650	250.2882616	5.476600e-03
Asset_Turnover_Ratio*	32	2995	1.593908e+03	9.125296e+02	1589.00	1.593069e+03	1.169771e+03	3.000000e+00	3.191000e+03	3.188000e+03	0.0060707	-1.1948796	1.667434e+01

Convert Data to Numeric

mydata$Asset_Turnover_Ratio=as.numeric(mydata$Asset_Turnover_Ratio)
mydata$Joint_Commission_Certified=as.numeric(mydata$Joint_Commission_Certified)

Impute Missing

for(i in 1:ncol(mydata)){
  mydata[is.na(mydata[,i]), i] <- median(mydata[,i], na.rm = TRUE)
}

Split and Scale

#Splitting data in train and test data
set.seed(123)
N=nrow(mydata)
mys=sample(1:N,round(.5*N,0), replace=T)
train=mydata[mys,]
test=mydata[-mys,]
c(nrow(train),nrow(test))

## [1] 1498 1818

library(caret)

transform=preProcess(train, method=c("range"))
train=predict(transform,train)
test=predict(transform,test)

Oversample

newtmp=train[train$Bankrupt==1,]
myrows=nrow(newtmp)
set.seed(2)
mys2=sample(1:myrows, 3000,replace=T)
tmp=newtmp[mys2,]

Undersample

newtmp=train[train$Bankrupt==0,]
myrows=nrow(newtmp)
set.seed(3)
mys2=sample(1:myrows,1000,replace=T)
train=rbind(newtmp[mys2,], tmp)

Log Regression

library(glmnet)

## Warning: package 'glmnet' was built under R version 4.0.5

## Loading required package: Matrix

## Loaded glmnet 4.1-2

myglm=glm(Bankrupt~., family = binomial(link='logit'), data=train)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

mypred=round(predict(myglm,test[,-1], type="response"),0)
confusionMatrix(as.factor(mypred), as.factor(test$Bankrupt), positive="1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1624    4
##          1  179   11
##                                           
##                Accuracy : 0.8993          
##                  95% CI : (0.8846, 0.9128)
##     No Information Rate : 0.9917          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0935          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.733333        
##             Specificity : 0.900721        
##          Pos Pred Value : 0.057895        
##          Neg Pred Value : 0.997543        
##              Prevalence : 0.008251        
##          Detection Rate : 0.006051        
##    Detection Prevalence : 0.104510        
##       Balanced Accuracy : 0.817027        
##                                           
##        'Positive' Class : 1               
##

Fit Random Forest

ctrl=trainControl(method = "LOOCV")
# Fitting Random Forest to the train dataset
set.seed(1)  # Setting seed
myRF = randomForest(x = train[-1], y = as.factor(train$Bankrupt),
             ntree=100, mtry=3,trControl=ctrl)
myRF

## 
## Call:
##  randomForest(x = train[-1], y = as.factor(train$Bankrupt), ntree = 100,      mtry = 3, trControl = ctrl) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 0%
## Confusion matrix:
##      0    1 class.error
## 0 1000    0           0
## 1    0 3000           0

RF Confusion Matrix and Metrics for Test Set

y_pred = predict(myRF, newdata = test[-1])
confusionMatrix(y_pred, as.factor(test$Bankrupt), positive="1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1802   14
##          1    1    1
##                                           
##                Accuracy : 0.9917          
##                  95% CI : (0.9864, 0.9954)
##     No Information Rate : 0.9917          
##     P-Value [Acc > NIR] : 0.568090        
##                                           
##                   Kappa : 0.1159          
##                                           
##  Mcnemar's Test P-Value : 0.001946        
##                                           
##             Sensitivity : 0.0666667       
##             Specificity : 0.9994454       
##          Pos Pred Value : 0.5000000       
##          Neg Pred Value : 0.9922907       
##              Prevalence : 0.0082508       
##          Detection Rate : 0.0005501       
##    Detection Prevalence : 0.0011001       
##       Balanced Accuracy : 0.5330560       
##                                           
##        'Positive' Class : 1               
##

Fit XGBoost

library(xgboost)

## Warning: package 'xgboost' was built under R version 4.0.5

bst<- xgboost(data = as.matrix(train[,-1]), label = train$Bankrupt, max.depth = 3, eta = .01, nthread = 2, nrounds = 5000, objective = "binary:logistic", verbose=0)

## [09:30:50] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.

pred <- predict(bst, as.matrix(test[,-1]))
err <- mean(as.numeric(pred > 0.5) != test$Bankrupt)
err

## [1] 0.006050605

confusionMatrix(as.factor(round(pred,0)), as.factor(test$Bankrupt), positive='1')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1799    7
##          1    4    8
##                                          
##                Accuracy : 0.9939         
##                  95% CI : (0.9892, 0.997)
##     No Information Rate : 0.9917         
##     P-Value [Acc > NIR] : 0.1837         
##                                          
##                   Kappa : 0.5896         
##                                          
##  Mcnemar's Test P-Value : 0.5465         
##                                          
##             Sensitivity : 0.533333       
##             Specificity : 0.997781       
##          Pos Pred Value : 0.666667       
##          Neg Pred Value : 0.996124       
##              Prevalence : 0.008251       
##          Detection Rate : 0.004400       
##    Detection Prevalence : 0.006601       
##       Balanced Accuracy : 0.765557       
##                                          
##        'Positive' Class : 1              
##

Produce Classification Plots & Importance Metrics

#xgboost

xgb.plot.tree(model = bst, trees = 1)

importance_matrix =xgb.importance(colnames(train[,-1]), model = bst)
xgb.ggplot.importance(importance_matrix)

# Plotting model
plot(myRF)

# Importance plot
importance(myRF)

##                                 MeanDecreaseGini
## Medical_Center                         1.7104464
## For_Profit                            30.6289000
## Government                             3.6210719
## Sole_Community_Hospital                7.9621004
## Market_Concentration                  26.5916447
## Case_Mix_Index                        63.1194109
## Staffed_Beds                          50.7836766
## Urban                                  0.5140279
## HCAHPS_Recommend                      41.9967584
## Joint_Commission_Certified             3.7460658
## Medicare_Days                         39.1492713
## Medicaid_Days                         27.1875925
## Other_Pay_Days                        17.9879686
## Serious_Complication_Rate             56.7894952
## Bed_Utilization_Rate                  65.6773390
## Facility_Age                          41.5947709
## Affiliated_Physicians                 30.5935444
## Current_Ratio                         85.5537365
## Quick_Ratio                           91.0658275
## Total_Liabilities                    137.2097256
## Total_Assets                         109.1168123
## Cash_on_Hand                          47.6951714
## Days_Cash_on_Hand                     37.6601491
## Net_Patient_Revenue                   81.3756940
## Operating_Income                      41.5810777
## Accounts_Receivable                   39.9215887
## Unreimbursed_Uncompensated_Care      144.5551548
## Debt_Equty_Ratio                      64.6525918
## Labor_Compensation_Ratio              25.7900284
## Net_Operating_Profit_Margin           28.4190374
## Asset_Turnover_Ratio                  57.9490051

# Variable importance plot
varImpPlot(myRF)

newdata=mydata[mydata$Unreimbursed_Uncompensated_Care<.01,]
boxplot(Unreimbursed_Uncompensated_Care~Bankrupt, data=newdata, horizontal=T, col=c('blue','red'), notch=T)

newdata=mydata[mydata$Total_Assets<.1,]
boxplot(Total_Assets~Bankrupt, data=newdata, horizontal=T, col=c('blue','red'), notch=T)

## Warning in bxp(list(stats = structure(c(-92478391, -63979036.5, -17207671, :
## some notches went outside hinges ('box'): maybe set notch=FALSE

Bankruptcy

Fulton

2/21/22

Load Libraries

Read Data

Describe

Convert Data to Numeric

Impute Missing

Split and Scale

Oversample

Undersample

Log Regression

Fit Random Forest

RF Confusion Matrix and Metrics for Test Set

Fit XGBoost

Produce Classification Plots & Importance Metrics