The dataset used is from UCI Machine learning website and represents 10 years (1999-2008) of clinical care at 130 U.S. hospitals. Hospital readmission for diabetic patients is a major concern in the United States. Over $250 million dollars was spent on treatment of readmitted diabetic inpatients in 2011. This disease is chronic and does not have any specific cure. Hospital readmissions are expensive as hospitals face penalties if their readmission rate is higher than expected and reflects the inadequacies in health care system. Most hospitals can agree that their main goals are centered around improving outcomes, creating more satisfied patients, and better value. For these reasons, it is important for the hospitals to improve focus on reducing readmission rates. This study attempts to identify the key factors that influence readmission for diabetes and to predict the probability of patient readmission.
The results obtained by random forest and decision tree on the actual imbalanced data set are the same as previous research on this data set, as evidence towards the predictors available are not encompassing the true cause of readmission. However, I used synthetic data balancing techniques and improved model accuracy
#binData <- getBinaryURL("https://archive.ics.uci.edu/ml/machine-learning-databases/00296/dataset_diabetes.zip", ssl.verifypeer=FALSE)
#head(binData)
#conObj <- file("dataset_diabetes.zip", open = "wb") # writing binary file
#writeBin(binData, conObj) #transfer binary data
#close(conObj) # close connection
#files <- unzip("dataset_diabetes.zip")
#diabetes <- read.csv(files[1], stringsAsFactors = FALSE) # file[1] = diabetic_data.csv
#names(diabetes)
#head(diabetes)
#1: remove non useful columns-
#import data if data ingenstion does not work- we have shortned the title of some variables like discharge_disposition_id as discharge_id so that decision trees are better
diabetes <- read.csv("C:/Users/Amit/Dropbox/Dataset/diabetic_data.csv")
diabetes1 <- subset(diabetes,select=-c(encounter_id, patient_nbr, examide,citoglipton,weight, payer_code, medical_spec))
diabetes2 <- diabetes1[diabetes1$race != "?",] # No. of observations drops by 2273
diabetes2 <- diabetes2[diabetes2$diag_1 != "?",] # No of observations drops by 21
diabetes2 <- diabetes2[diabetes2$diag_2 != "?",] # No of observations drops by 358
diabetes2 <- diabetes2[diabetes2$diag_3 != "?",] # No of observations drops by 1453
diabetes2$readmittedbin <- ifelse(diabetes2$readmitted == "<30",1,0)
diabetes3 <- cbind(diabetes2[c(7:13,17)], lapply(diabetes2[c(1:6,14:16,18:44)],factor))
head(diabetes3)
## time_in_hos num_lab_proc num_proc num_medic num_out num_emerg num_inpat
## 2 3 59 0 18 0 0 0
## 3 2 11 5 13 2 0 1
## 4 2 44 1 16 0 0 0
## 5 1 51 0 8 0 0 0
## 6 3 31 6 16 0 0 0
## 7 4 70 1 21 0 0 0
## num_diag race gender age adm_type_id discharge_id
## 2 9 Caucasian Female [10-20) 1 1
## 3 6 AfricanAmerican Female [20-30) 1 1
## 4 7 Caucasian Male [30-40) 1 1
## 5 5 Caucasian Male [40-50) 1 1
## 6 9 Caucasian Male [50-60) 2 1
## 7 7 Caucasian Male [60-70) 3 1
## adm_source_id diag_1 diag_2 diag_3 max_glu_ser A1Cresult metformin
## 2 7 276 250.01 255 None None No
## 3 7 648 250 V27 None None No
## 4 7 8 250.43 403 None None No
## 5 7 197 157 250 None None No
## 6 2 414 411 250 None None No
## 7 2 414 411 V45 None None Steady
## repaglinide nateglinide chlorpropamide glimepiride acetohexamide
## 2 No No No No No
## 3 No No No No No
## 4 No No No No No
## 5 No No No No No
## 6 No No No No No
## 7 No No No Steady No
## glipizide glyburide tolbutamide pioglitazone rosiglitazone acarbose
## 2 No No No No No No
## 3 Steady No No No No No
## 4 No No No No No No
## 5 Steady No No No No No
## 6 No No No No No No
## 7 No No No No No No
## miglitol troglitazone tolazamide insulin glyburide.metformin
## 2 No No No Up No
## 3 No No No No No
## 4 No No No Up No
## 5 No No No Steady No
## 6 No No No Steady No
## 7 No No No Steady No
## glipizide.metformin glimepiride.pioglitazone metformin.rosiglitazone
## 2 No No No
## 3 No No No
## 4 No No No
## 5 No No No
## 6 No No No
## 7 No No No
## metformin.pioglitazone change diabetesMed readmitted readmittedbin
## 2 No Ch Yes >30 0
## 3 No No Yes NO 0
## 4 No Ch Yes NO 0
## 5 No Ch Yes NO 0
## 6 No No Yes >30 0
## 7 No Ch Yes NO 0
table(diabetes3$readmitted)
##
## <30 >30 NO
## 11066 34649 52338
prop.table(table(diabetes3$readmitted))
##
## <30 >30 NO
## 0.1128573 0.3533701 0.5337726
table(diabetes3$readmittedbin)
##
## 0 1
## 86987 11066
prop.table(table(diabetes3$readmittedbin))
##
## 0 1
## 0.8871427 0.1128573
racewise <- table(diabetes3$readmittedbin,diabetes3$race)
racewise
##
## AfricanAmerican Asian Caucasian Hispanic Other
## 0 16741 560 66567 1777 1342
## 1 2140 65 8512 207 142
genderwise <- table(diabetes3$readmittedbin,diabetes3$gender)
genderwise
##
## Female Male Unknown/Invalid
## 0 46831 40155 1
## 1 6002 5064 0
agewise <- table(diabetes3$readmittedbin,diabetes3$age)
agewise
##
## [0-10) [10-20) [20-30) [30-40) [40-50) [50-60) [60-70) [70-80) [80-90)
## 0 64 435 1265 3140 8265 15059 19349 22302 14690
## 1 1 31 213 408 1000 1638 2460 3004 2012
##
## [90-100)
## 0 2418
## 1 299
timinhoswise <- table(diabetes3$readmittedbin,diabetes3$time_in_hos)
timinhoswise
##
## 1 2 3 4 5 6 7 8 9 10 11
## 0 12363 14791 15201 11845 8519 6396 4961 3661 2524 1954 1618
## 1 1127 1650 1848 1589 1180 924 733 615 404 333 191
##
## 12 13 14
## 0 1231 1038 885
## 1 193 147 132
Number_patients <- table(diabetes3$readmitted)
plot(Number_patients,col ="lightblue", xlab = " Readmission Days ", main= " Frequency of Readmission", lwd =20,pch=18)
Number_patients_bin <- table(diabetes3$readmittedbin)
plot(Number_patients_bin,col ="lightblue", xlab = " Readmission Days ", main= " Frequency of Readmission", lwd =20,pch=18)
# split the data into training and testing data sets #### We are using a sample training data as our original sample has 100,000 observations. This large data takes a long time to finish and we face issues with Rmarkdown. Hence we are using a smaller training data (20,000 observations)
set.seed(111)
inTrain <- createDataPartition(diabetes3$readmittedbin, p=.2, list=FALSE)
objTrain <-diabetes3[inTrain,]
objTest <- diabetes3[-inTrain,]
table(objTrain$readmittedbin)
##
## 0 1
## 17398 2214
prop.table(table(objTrain$readmittedbin))
##
## 0 1
## 0.8871099 0.1128901
table(objTest$readmittedbin)
##
## 0 1
## 69589 8852
prop.table(table(objTest$readmittedbin))
##
## 0 1
## 0.8871509 0.1128491
As we see, this data set contains only 11% of cases of readmission within 30 days and 89% of cases with readmission after 30 days or No readmission. This is a severely imbalanced data set.
Let us see how badly can this affect our prediction accuracy ? Let’s build a model on this data. I’ll be using decision tree algorithm for modeling purpose. #Prediction with three levels of response variable
cfit <- rpart(readmitted ~ time_in_hos + num_lab_proc + num_proc + num_medic + num_out + num_emerg + num_inpat + race + age + adm_type_id + discharge_id + adm_source_id + num_diag + max_glu_ser + A1Cresult + metformin + insulin, data = objTrain, method="class", minsplit = 20, minbucket = 5, cp = 0.001)
head(predict(cfit))
## <30 >30 NO
## 7 0.08252352 0.3054913 0.6119852
## 10 0.08252352 0.3054913 0.6119852
## 19 0.08252352 0.3054913 0.6119852
## 33 0.08252352 0.3054913 0.6119852
## 37 0.08252352 0.3054913 0.6119852
## 39 0.08252352 0.3054913 0.6119852
par(mar=c(1,1,0.25,1))
plot(cfit, branch = 0.4,uniform = TRUE, compress = TRUE)
text(cfit, pretty = 0)
rpart.predict <- predict(cfit, newdata = objTrain, type="class")
tail(rpart.predict)
## 101736 101744 101746 101748 101753 101757
## >30 NO NO NO NO >30
## Levels: <30 >30 NO
#accuracy.meas(objTest$readmitted, rpart.predict[,2])
cf <-confusionMatrix(rpart.predict, objTrain$readmitted)
cf
## Confusion Matrix and Statistics
##
## Reference
## Prediction <30 >30 NO
## <30 131 85 28
## >30 771 2328 1470
## NO 1312 4513 8974
##
## Overall Statistics
##
## Accuracy : 0.583
## 95% CI : (0.576, 0.5899)
## No Information Rate : 0.534
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.1877
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: <30 Class: >30 Class: NO
## Sensitivity 0.05917 0.3361 0.8570
## Specificity 0.99351 0.8233 0.3627
## Pos Pred Value 0.53689 0.5095 0.6064
## Neg Pred Value 0.89245 0.6943 0.6888
## Prevalence 0.11289 0.3532 0.5340
## Detection Rate 0.00668 0.1187 0.4576
## Detection Prevalence 0.01244 0.2330 0.7546
## Balanced Accuracy 0.52634 0.5797 0.6098
#Mean error rate
mean.error.rate.rpart <- 1- cf$overall[1]
mean.error.rate.rpart
## Accuracy
## 0.4170406
par(mar=c(3,3,3,3))
plotcp(cfit,lty = 3, col = 1)
printcp(cfit)
##
## Classification tree:
## rpart(formula = readmitted ~ time_in_hos + num_lab_proc + num_proc +
## num_medic + num_out + num_emerg + num_inpat + race + age +
## adm_type_id + discharge_id + adm_source_id + num_diag + max_glu_ser +
## A1Cresult + metformin + insulin, data = objTrain, method = "class",
## minsplit = 20, minbucket = 5, cp = 0.001)
##
## Variables actually used in tree construction:
## [1] adm_source_id adm_type_id age discharge_id insulin
## [6] num_diag num_inpat num_lab_proc num_medic num_proc
##
## Root node error: 9140/19612 = 0.46604
##
## n= 19612
##
## CP nsplit rel error xerror xstd
## 1 0.0468271 0 1.00000 1.00000 0.0076433
## 2 0.0203501 1 0.95317 0.95317 0.0076132
## 3 0.0051422 2 0.93282 0.93315 0.0075957
## 4 0.0014442 5 0.91740 0.92987 0.0075926
## 5 0.0014223 12 0.90613 0.92987 0.0075926
## 6 0.0012035 13 0.90470 0.92921 0.0075920
## 7 0.0011488 15 0.90230 0.92834 0.0075912
## 8 0.0010284 17 0.90000 0.92746 0.0075903
## 9 0.0010000 22 0.89486 0.92681 0.0075897
# The plot shows 3 branches as optimum. The CP value for 3 branches would be around 0.0014. Pruning and replotting the tree:
# Cross-validation of the tree
cfit.tree <- tree(readmitted ~ time_in_hos + num_lab_proc + num_proc + num_medic + num_out + num_emerg + num_inpat + race + age + adm_type_id + discharge_id + adm_source_id + num_diag + max_glu_ser + A1Cresult + metformin + insulin, data = objTrain, method="class")
cv.cfit.tree <- cv.tree(cfit.tree, FUN = prune.misclass)
cv.cfit.tree
## $size
## [1] 2 1
##
## $dev
## [1] 8712 9140
##
## $k
## [1] -Inf 428
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
prune.cfit.tree <- prune.misclass(cfit.tree, best = 4)
## Warning in prune.tree(tree = cfit.tree, best = 4, method = "misclass"):
## best is bigger than tree size
#plot(prune.cfit.tree)
text(prune.cfit.tree, pretty = 0)
#After pruning
cfit2 = prune(cfit, cp = 0.0014)
par(mar=c(1,1,0.25,1))
plot(cfit2, branch = 0.4,uniform = TRUE, compress = TRUE)
text(cfit2)
#Using the pruned tree to predict and pulling up the mean error rate and confusion matrix
#Prediction on test set
rpart.prune.predict <- predict(cfit2, newdata = objTest,type = "class")
cf.prune <-confusionMatrix(rpart.prune.predict,objTest$readmitted)
#Mean error rate
mean.error.rate.rpart.prune <- 1- cf.prune$overall[1]
mean.error.rate.rpart.prune
## Accuracy
## 0.4312923
cf.prune$table
## Reference
## Prediction <30 >30 NO
## <30 8 37 42
## >30 3628 9735 6957
## NO 5216 17951 34867
cfit_bin <- rpart(readmittedbin ~ time_in_hos + num_lab_proc + num_proc + num_medic + num_out + num_emerg + num_inpat + race + age + adm_type_id + discharge_id + adm_source_id + num_diag + max_glu_ser + A1Cresult + metformin + insulin, data = objTrain, method="class", minsplit = 1, minbucket = 1, cp = 0.001)
par(mar=c(2,2,0.25,1))
plot(cfit_bin, branch = 0.4,uniform = TRUE, compress = TRUE)
text(cfit_bin, pretty = 0)
#How to read plotcp - http://www.wekaleamstudios.co.uk/posts/classification-trees-using-the-rpart-function/#m3mLNpeke0I
rpart.predict_bin <- predict(cfit_bin, newdata = objTrain,type="prob")
View(objTrain)
head(rpart.predict_bin)
## 0 1
## 7 0.9122884 0.08771157
## 10 0.9122884 0.08771157
## 19 0.9122884 0.08771157
## 33 0.9122884 0.08771157
## 37 0.9122884 0.08771157
## 39 0.9122884 0.08771157
View(rpart.predict_bin)
accuracy.meas(objTrain$readmittedbin, rpart.predict_bin[,2])
##
## Call:
## accuracy.meas(response = objTrain$readmittedbin, predicted = rpart.predict_bin[,
## 2])
##
## Examples are labelled as positive when predicted is greater than 0.5
##
## precision: 0.801
## recall: 0.062
## F: 0.057
roc.curve(objTrain$readmittedbin, rpart.predict_bin[,2], plotit = T)
## Area under the curve (AUC): 0.610
par =TRUE
AUC = 0.61 is a terribly low score. Therefore, it is necessary to balanced data before applying a machine learning algorithm. In this case, the algorithm gets biased toward the majority class and fails to map minority class. I have used the sampling techniques and try to improve this prediction accuracy.
str(rpart.predict_bin)
## num [1:19612, 1:2] 0.912 0.912 0.912 0.912 0.912 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:19612] "7" "10" "19" "33" ...
## ..$ : chr [1:2] "0" "1"
str(objTrain$readmittedbin)
## Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 2 ...
#cf_bin <-confusionMatrix(rpart.predict_bin, objTrain$readmittedbin)
#cf_bin
#Mean error rate
#mean.error.rate.rpart_bin <- 1- cf_bin$overall[1]
#mean.error.rate.rpart_bin
#par(mar=c(3,3,3,3))
#plotcp(cfit_bin,lty = 3, col = 1)
#printcp(cfit_bin)
#After pruning
cfit2_bin = prune(cfit_bin, cp = 0.0001)
par(mar=c(.5,.5,.5,.5))
plot(cfit2_bin, branch = 0.4,uniform = TRUE, compress = TRUE)
text(cfit2_bin, pretty=0)
head(predict(cfit2_bin))
## 0 1
## 7 0.9122884 0.08771157
## 10 0.9122884 0.08771157
## 19 0.9122884 0.08771157
## 33 0.9122884 0.08771157
## 37 0.9122884 0.08771157
## 39 0.9122884 0.08771157
#Prediction on training set
rpart.prune.predict2_bin <- predict(cfit2_bin, newdata = objTrain,type = "class")
cf.prune_bin <-confusionMatrix(rpart.prune.predict2_bin,objTrain$readmittedbin)
cf.prune_bin
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 17364 2077
## 1 34 137
##
## Accuracy : 0.8924
## 95% CI : (0.8879, 0.8967)
## No Information Rate : 0.8871
## P-Value [Acc > NIR] : 0.01001
##
## Kappa : 0.1003
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.99805
## Specificity : 0.06188
## Pos Pred Value : 0.89316
## Neg Pred Value : 0.80117
## Prevalence : 0.88711
## Detection Rate : 0.88538
## Detection Prevalence : 0.99128
## Balanced Accuracy : 0.52996
##
## 'Positive' Class : 0
##
#Mean error rate
mean.error.rate.rpart.prune2 <- 1- cf.prune_bin$overall[1]
mean.error.rate.rpart.prune2
## Accuracy
## 0.1076382
table(objTrain$readmittedbin)
##
## 0 1
## 17398 2214
data_balanced_over <- ovun.sample(readmittedbin ~ time_in_hos + num_lab_proc + num_proc + num_medic + num_out + num_emerg + num_inpat + race + age + adm_type_id + discharge_id + adm_source_id + num_diag + max_glu_ser + A1Cresult + metformin + insulin, data = objTrain, method = "over", N = 34794)$data
table(data_balanced_over$readmittedbin)
##
## 0 1
## 17398 17396
data_balanced_under <- ovun.sample(readmittedbin ~ time_in_hos + num_lab_proc + num_proc + num_medic + num_out + num_emerg + num_inpat + race + age + adm_type_id + discharge_id + adm_source_id + num_diag + max_glu_ser + A1Cresult + metformin + insulin, data = objTrain, method = "under", N = 4428, seed=1)$data
table(data_balanced_under$readmittedbin)
##
## 0 1
## 2214 2214
data_balanced_both <- ovun.sample(readmittedbin ~ time_in_hos + num_lab_proc + num_proc + num_medic + num_out + num_emerg + num_inpat + race + age + adm_type_id + discharge_id + adm_source_id + num_diag + max_glu_ser + A1Cresult + metformin + insulin, data = objTrain, method = "both", N = 19610, seed=1)$data
table(data_balanced_both$readmittedbin)
##
## 0 1
## 9847 9763
ROSE helps us to generate data synthetically as well. The data generated using ROSE is considered to provide better estimate of original data.
data.rose <- ROSE(readmittedbin ~ time_in_hos + num_lab_proc + num_proc + num_medic + num_out + num_emerg + num_inpat + race + age + adm_type_id + discharge_id + adm_source_id + num_diag + max_glu_ser + A1Cresult + metformin + insulin, data = objTrain,seed=1)$data
table(data.rose$readmittedbin)
##
## 0 1
## 9848 9764
cfit.rose <- rpart(readmittedbin ~ time_in_hos + num_lab_proc + num_proc + num_medic + num_out + num_emerg + num_inpat + race + age + adm_type_id + discharge_id + adm_source_id + num_diag + max_glu_ser + A1Cresult + metformin + insulin, data = data.rose)
head(data.rose)
## time_in_hos num_lab_proc num_proc num_medic num_out num_emerg
## 1 3.875275 15.260395 0.5041033 8.912528 1.1967522 -0.003483168
## 2 11.566065 59.596280 -0.1071821 19.516326 -0.2592942 0.591731331
## 3 3.825353 1.079877 -0.9974357 3.793863 -0.1146781 0.946548770
## 4 4.756819 38.079734 1.3753153 28.513673 1.0952137 0.199230920
## 5 1.423249 40.854999 1.4328905 7.907884 0.6224822 0.183480966
## 6 2.379349 23.891914 1.8367864 16.530931 -0.6984490 0.063265616
## num_inpat num_diag race age adm_type_id discharge_id
## 1 0.8878647 7.493924 Caucasian [40-50) 1 1
## 2 3.1756714 6.979433 AfricanAmerican [70-80) 1 18
## 3 -0.4678477 4.832296 Caucasian [60-70) 1 1
## 4 -0.1664868 9.073524 AfricanAmerican [40-50) 3 2
## 5 1.0278402 9.767368 AfricanAmerican [70-80) 3 6
## 6 0.2439110 4.901822 AfricanAmerican [70-80) 2 3
## adm_source_id max_glu_ser A1Cresult metformin insulin readmittedbin
## 1 7 None None No Steady 0
## 2 7 None None No No 0
## 3 7 None None No No 0
## 4 1 None None No No 0
## 5 1 None None No Steady 0
## 6 1 None None No Steady 0
rpart.predict.rose <- predict(cfit.rose, newdata = data.rose)
par(2,2,2,2)
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
#roc.curve(data.rose$readmittedbin, rpart.predict.rose[,2], col= redblue(10000), add =TRUE)
par =TRUE
#Prediction on rose set
rpart.prune.predict3_bin <- predict(cfit.rose, newdata = data.rose,type = "class")
cf.prune_bin <-confusionMatrix(rpart.prune.predict3_bin,objTrain$readmittedbin)
cf.prune_bin
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7733 1017
## 1 9665 1197
##
## Accuracy : 0.4553
## 95% CI : (0.4483, 0.4623)
## No Information Rate : 0.8871
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.0055
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.4445
## Specificity : 0.5407
## Pos Pred Value : 0.8838
## Neg Pred Value : 0.1102
## Prevalence : 0.8871
## Detection Rate : 0.3943
## Detection Prevalence : 0.4462
## Balanced Accuracy : 0.4926
##
## 'Positive' Class : 0
##
#Mean error rate
mean.error.rate.rpart.prune2 <- 1- cf.prune_bin$overall[1]
mean.error.rate.rpart.prune2
## Accuracy
## 0.5446665
cfit.over <- rpart(readmittedbin ~ time_in_hos + num_lab_proc + num_proc + num_medic + num_out + num_emerg + num_inpat + race + age + adm_type_id + discharge_id + adm_source_id + num_diag + max_glu_ser + A1Cresult + metformin + insulin, data = data_balanced_over)
rpart.predict.over <- predict(cfit.over, newdata = data_balanced_over)
#plot(, colorize = TRUE)
#plot(perf2, add = TRUE, colorize = TRUE)
#roc.curve(data_balanced_over$readmittedbin, rpart.predict.over[,2], add =TRUE, col = greenred(2) )
#str(rpart.predict.over)
#confusionMatrix(data_balanced_over$readmittedbin, rpart.predict.over[,2])
cfit.under <- rpart(readmittedbin ~ time_in_hos + num_lab_proc + num_proc + num_medic + num_out + num_emerg + num_inpat + race + age + adm_type_id + discharge_id + adm_source_id + num_diag + max_glu_ser + A1Cresult + metformin + insulin, data = data_balanced_under)
rpart.predict.under <- predict(cfit.over, newdata = data_balanced_under)
par(new=TRUE)
## Warning in par(new = TRUE): calling par(new=TRUE) with no plot
#roc.curve(data_balanced_under$readmittedbin, rpart.predict.under[,2], add =TRUE, col = bluered(2))
cfit.both <- rpart(readmittedbin ~ time_in_hos + num_lab_proc + num_proc + num_medic + num_out + num_emerg + num_inpat + race + age + adm_type_id + discharge_id + adm_source_id + num_diag + max_glu_ser + A1Cresult + metformin + insulin, data = data_balanced_both)
rpart.predict.both <- predict(cfit.both, newdata = data_balanced_both)
#roc.curve(data_balanced_both$readmittedbin, rpart.predict.both[,2], add =TRUE, col = redblue(5))
# ROC curve comparison
img1 <- rasterGrob(as.raster(readPNG("C:/Users/Amit/Desktop/DA6813/Project/Final/ROC curve comparison.png")), interpolate = FALSE)
#img1
grid.arrange(img1,ncol = 1)
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.3.2
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
rf.diabetes_bin <- randomForest(readmittedbin ~ time_in_hos + num_lab_proc + num_proc + num_medic + num_out + num_emerg + num_inpat + race + age + adm_type_id + discharge_id + adm_source_id + num_diag + max_glu_ser + A1Cresult + metformin + insulin, data = objTrain,importance=TRUE)
rf.diabetes_bin
##
## Call:
## randomForest(formula = readmittedbin ~ time_in_hos + num_lab_proc + num_proc + num_medic + num_out + num_emerg + num_inpat + race + age + adm_type_id + discharge_id + adm_source_id + num_diag + max_glu_ser + A1Cresult + metformin + insulin, data = objTrain, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 11.31%
## Confusion matrix:
## 0 1 class.error
## 0 17340 58 0.003333717
## 1 2160 54 0.975609756
rf.predict_bin <- predict(rf.diabetes_bin,newdata =objTest)
#Plotting the errors from Random Forest model:
par(mar=c(3,3,3,3))
plot(rf.diabetes_bin, type="l")
varImpPlot(rf.diabetes_bin,main = "Important Variables")
importance(rf.diabetes_bin)
## 0 1 MeanDecreaseAccuracy MeanDecreaseGini
## time_in_hos 30.388885 -9.8969499 26.510312 341.28179
## num_lab_proc 27.665440 -10.0887440 23.281411 599.02434
## num_proc 18.360897 -7.8626549 14.909038 240.60511
## num_medic 27.343215 -7.0345684 24.714419 516.65157
## num_out 6.280604 -0.0935255 5.995584 124.98060
## num_emerg 10.386787 6.0944550 12.197632 99.86427
## num_inpat 28.123125 39.3783559 40.445917 230.37618
## race 5.493411 -1.0518489 4.966326 114.22172
## age 14.262086 -3.2689006 12.047534 288.47120
## adm_type_id 31.959289 -10.6238770 28.858850 163.61962
## discharge_id 38.593537 18.7142693 43.531328 262.42699
## adm_source_id 37.431836 -14.9614893 33.920329 148.73872
## num_diag 19.031800 -7.1933292 16.220604 212.77271
## max_glu_ser 11.172379 -2.2896010 10.704692 37.20528
## A1Cresult 8.736507 -2.0640434 7.774281 104.19680
## metformin 3.776961 -1.8232015 2.954200 78.97969
## insulin 10.224336 1.2757770 10.380395 176.79835
# Confusion Matrix and the mean error rate:
rf.cm_bin <- confusionMatrix(rf.predict_bin,objTest$readmittedbin)
rf.cm_bin$table
## Reference
## Prediction 0 1
## 0 69380 8699
## 1 209 153
#Mean error rate
mean.error.rate.rf <- (1- rf.cm_bin$overall[1])
mean.error.rate.rf
## Accuracy
## 0.1135631
Random on three class response variable
library(randomForest)
rf.diabetes <- randomForest(readmitted ~ time_in_hos + num_lab_proc + num_proc + num_medic + num_out + num_emerg + num_inpat + race + age + adm_type_id + discharge_id + adm_source_id + num_diag + max_glu_ser + A1Cresult + metformin + insulin, data = objTrain,importance=TRUE)
rf.diabetes
##
## Call:
## randomForest(formula = readmitted ~ time_in_hos + num_lab_proc + num_proc + num_medic + num_out + num_emerg + num_inpat + race + age + adm_type_id + discharge_id + adm_source_id + num_diag + max_glu_ser + A1Cresult + metformin + insulin, data = objTrain, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 43.4%
## Confusion matrix:
## <30 >30 NO class.error
## <30 79 916 1219 0.9643180
## >30 72 2706 4148 0.6092983
## NO 51 2106 8315 0.2059778
rf.predict <- predict(rf.diabetes,newdata =objTest)
#Plotting the errors from Random Forest model:
par(mar=c(3,3,3,3))
plot(rf.diabetes, type="l")
varImpPlot(rf.diabetes,main = "Important Variables")
importance(rf.diabetes)
## <30 >30 NO MeanDecreaseAccuracy
## time_in_hos -2.5103516 -1.9087006 22.6435602 17.776989
## num_lab_proc -2.6666105 -6.0545499 30.1804795 20.510687
## num_proc -3.0319221 0.8995026 24.3840286 21.174606
## num_medic -1.2175113 0.6825077 21.2041241 17.632025
## num_out -0.7306174 4.6210256 19.0669299 16.574863
## num_emerg 10.1746854 -1.3574304 34.3195131 27.161628
## num_inpat 35.9344650 16.9973581 77.4435013 81.298202
## race 0.6177442 1.9515527 4.1947964 4.765272
## age -1.6466524 1.4748083 21.2182238 16.849210
## adm_type_id -3.0084656 -10.2017747 35.9439677 28.683265
## discharge_id 18.6391925 20.5503522 48.7661765 56.662992
## adm_source_id -6.7147868 -8.4674467 43.1289239 39.953983
## num_diag -1.0577076 4.2466616 26.9470841 24.287593
## max_glu_ser -0.9574368 0.7692303 8.6246984 8.062100
## A1Cresult -0.2730523 3.5216448 8.2785308 8.655071
## metformin -0.1475653 -1.3762764 -0.7313216 -1.432654
## insulin 2.8548323 1.2797640 11.4799624 10.152148
## MeanDecreaseGini
## time_in_hos 1032.7647
## num_lab_proc 1693.0244
## num_proc 667.4413
## num_medic 1449.7031
## num_out 281.2987
## num_emerg 206.6344
## num_inpat 566.9028
## race 338.2419
## age 859.5430
## adm_type_id 442.4881
## discharge_id 797.7079
## adm_source_id 389.7562
## num_diag 626.0575
## max_glu_ser 100.1906
## A1Cresult 323.3854
## metformin 265.0518
## insulin 541.8741
# Confusion Matrix and the mean error rate:
rf.cm <- confusionMatrix(rf.predict,objTest$readmitted)
rf.cm$table
## Reference
## Prediction <30 >30 NO
## <30 301 313 193
## >30 3595 11017 8456
## NO 4956 16393 33217
#Mean error rate
mean.error.rate.rf <- (1- rf.cm$overall[1])
# This gives error rate
mean.error.rate.rf
## Accuracy
## 0.4322484
#library(gbm)
#gbm.diabetes_bin <- gbm(readmittedbin ~ time_in_hos + num_lab_proc + num_proc + num_medic + num_out + num_emerg + num_inpat + race + age + adm_type_id + discharge_id + adm_source_id + num_diag + max_glu_ser + A1Cresult + metformin + insulin, data = objTrain, distribution = "bernoulli",n.trees =100,interaction.depth = 4, shrinkage = 0.005)
# Most important variables:
#par(mar=c(2,2,2,2))
#gbm.diabetes_bin
#par(mar=c(4,4,4,4))
#summary(gbm.diabetes_bin, xlim =0.1, ylim= 2)
#gbm.yhat <- predict(gbm.diabetes_bin,newdata = objTrain, n.trees = 200)
#head(gbm.yhat)
#gbm.yhat1 <- ifelse(gbm.yhat < 0.385,0,1)
#library(caret)
#gbm.cm <- confusionMatrix(gbm.yhat,objTest$readmittedbin)
#Mean error rate
#mean.error.rate.gbm <- 1- gbm.cm$overall[1]
#mean.error.rate.gbm