This chunk reads in the data, relevels factors, and prints a summary.
# Loading data
readmission <- read.csv(file="readmission.csv")
vars <- colnames(readmission)[c(2,3,5,9)] #variables to relevel
for (i in vars){
table <- as.data.frame(table(readmission[,i]))
max <- which.max(table[,2])
level.name <- as.character(table[max,1])
readmission[,i] <- relevel(readmission[,i], ref = level.name)
}
summary(readmission)
## Readmission.Status Gender Race ER
## Min. :0.0000 F:38011 White :56124 Min. :0.0000
## 1st Qu.:0.0000 M:28771 Black : 7099 1st Qu.:0.0000
## Median :0.0000 Hispanic: 1286 Median :0.0000
## Mean :0.1259 Others : 2273 Mean :0.5083
## 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :9.0000
## DRG.Class LOS Age HCC.Riskscore
## MED :35771 Min. : 1.000 Min. : 24.00 Min. : 0.079
## SURG :30447 1st Qu.: 3.000 1st Qu.: 67.00 1st Qu.: 1.107
## UNGROUP: 564 Median : 5.000 Median : 75.00 Median : 1.865
## Mean : 6.693 Mean : 73.64 Mean : 2.345
## 3rd Qu.: 8.000 3rd Qu.: 83.00 3rd Qu.: 3.173
## Max. :36.000 Max. :101.00 Max. :12.307
## DRG.Complication
## MedicalMCC.CC:18110
## MedicalNoC :12310
## Other : 9345
## SurgMCC.CC :15468
## SurgNoC :11549
##
##Task 1 I have elected to make a table for ER and histograms for the other three variables.
library(ggplot2)
table(readmission$ER)
##
## 0 1 2 3 4 5 6 7 9
## 43086 16280 5286 1572 438 105 10 3 2
ggplot(readmission,aes(x=LOS))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(readmission,aes(x=Age))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(readmission,aes(x=HCC.Riskscore))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
I will use log transformations of the LOS and HCC.Riskscore variables. I also create an indicator variables for those under age 65. The following chunk does these.
#two log transforms and removal of original variables
readmission$logLOS <- log(readmission$LOS)
readmission$logRiskscore <- log(readmission$HCC.Riskscore)
readmission$LOS <- NULL
readmission$HCC.Riskscore <- NULL
readmission$Under65 <- ifelse(readmission$Age < 65, 1, 0)
summary(readmission)
## Readmission.Status Gender Race ER
## Min. :0.0000 F:38011 White :56124 Min. :0.0000
## 1st Qu.:0.0000 M:28771 Black : 7099 1st Qu.:0.0000
## Median :0.0000 Hispanic: 1286 Median :0.0000
## Mean :0.1259 Others : 2273 Mean :0.5083
## 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :9.0000
## DRG.Class Age DRG.Complication logLOS
## MED :35771 Min. : 24.00 MedicalMCC.CC:18110 Min. :0.000
## SURG :30447 1st Qu.: 67.00 MedicalNoC :12310 1st Qu.:1.099
## UNGROUP: 564 Median : 75.00 Other : 9345 Median :1.609
## Mean : 73.64 SurgMCC.CC :15468 Mean :1.653
## 3rd Qu.: 83.00 SurgNoC :11549 3rd Qu.:2.079
## Max. :101.00 Max. :3.584
## logRiskscore Under65
## Min. :-2.5383 Min. :0.0000
## 1st Qu.: 0.1017 1st Qu.:0.0000
## Median : 0.6235 Median :0.0000
## Mean : 0.5999 Mean :0.1684
## 3rd Qu.: 1.1547 3rd Qu.:0.0000
## Max. : 2.5102 Max. :1.0000
##Task 2 This chunk creates a tabular view of the two variables.
table(readmission$DRG.Class,readmission$DRG.Complication)
##
## MedicalMCC.CC MedicalNoC Other SurgMCC.CC SurgNoC
## MED 18104 12310 5357 0 0
## SURG 6 0 3424 15468 11549
## UNGROUP 0 0 564 0 0
Six items will be deleted and the two existing variables recoded as a single factor variable. The following code does that.
readmission.new <- readmission #preserve the original data until the work can be checked
readmission.new <- readmission.new[!(readmission.new$DRG.Complication=="MedicalMCC.CC" & readmission.new$DRG.Class=="SURG"),]
readmission.new$DRG <- ifelse(readmission.new$DRG.Complication=="MedicalMCC.CC","Med.C",
ifelse(readmission.new$DRG.Complication=="MedicalNoC","Med.NoC",
ifelse(readmission.new$DRG.Complication=="SurgMCC.CC","Surg.C",
ifelse(readmission.new$DRG.Complication=="SurgNoC","Surg.NoC",
ifelse(readmission.new$DRG.Class=="UNGROUP","UNGROUP",
ifelse(readmission.new$DRG.Complication=="Other"&readmission.new$DRG.Class=="MED","OtherMED","OtherSURG"))))))
readmission.new$DRG <- as.factor(readmission.new$DRG)
table(readmission.new$DRG)
##
## Med.C Med.NoC OtherMED OtherSURG Surg.C Surg.NoC UNGROUP
## 18104 12310 5357 3424 15468 11549 564
readmission.new$DRG.Class <- NULL
readmission.new$DRG.Complication <- NULL
Relevel the new variable.
table <- as.data.frame(table(readmission.new[,"DRG"]))
max <- which.max(table[,2])
level.name <- as.character(table[max,1])
readmission.new[,"DRG"] <- relevel(readmission.new[,"DRG"], ref = level.name)
table(readmission.new$DRG)
##
## Med.C Med.NoC OtherMED OtherSURG Surg.C Surg.NoC UNGROUP
## 18104 12310 5357 3424 15468 11549 564
Accept the new dataframe by renaming it back to readmission.
readmission <- readmission.new
readmission.new <- NULL
summary(readmission)
## Readmission.Status Gender Race ER
## Min. :0.0000 F:38005 White :56120 Min. :0.0000
## 1st Qu.:0.0000 M:28771 Black : 7097 1st Qu.:0.0000
## Median :0.0000 Hispanic: 1286 Median :0.0000
## Mean :0.1259 Others : 2273 Mean :0.5083
## 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :9.0000
##
## Age logLOS logRiskscore Under65
## Min. : 24.00 Min. :0.000 Min. :-2.5383 Min. :0.0000
## 1st Qu.: 67.00 1st Qu.:1.099 1st Qu.: 0.1017 1st Qu.:0.0000
## Median : 75.00 Median :1.609 Median : 0.6238 Median :0.0000
## Mean : 73.64 Mean :1.653 Mean : 0.6000 Mean :0.1684
## 3rd Qu.: 83.00 3rd Qu.:2.079 3rd Qu.: 1.1547 3rd Qu.:0.0000
## Max. :101.00 Max. :3.584 Max. : 2.5102 Max. :1.0000
##
## DRG
## Med.C :18104
## Med.NoC :12310
## OtherMED : 5357
## OtherSURG: 3424
## Surg.C :15468
## Surg.NoC :11549
## UNGROUP : 564
##Task 3 This code performs cluster analysis for from 1 to 12 clusters and constructs an elbow plot.
nstart.val <- 20
cluster_vars <- readmission[c('logLOS','Age')]
for(i in 1:ncol(cluster_vars)){
cluster_vars[,i] <- scale(cluster_vars[,i])
}
km1 <- kmeans(cluster_vars,centers=1,nstart=nstart.val)
km2 <- kmeans(cluster_vars,centers=2,nstart=nstart.val)
km3 <- kmeans(cluster_vars,centers=3,nstart=nstart.val)
km4 <- kmeans(cluster_vars,centers=4,nstart=nstart.val)
km5 <- kmeans(cluster_vars,centers=5,nstart=nstart.val)
km6 <- kmeans(cluster_vars,centers=6,nstart=nstart.val)
km7 <- kmeans(cluster_vars,centers=7,nstart=nstart.val)
km8 <- kmeans(cluster_vars,centers=8,nstart=nstart.val)
km9 <- kmeans(cluster_vars,centers=9,nstart=nstart.val)
km10 <- kmeans(cluster_vars,centers=10,nstart=nstart.val)
km11 <- kmeans(cluster_vars,centers=11,nstart=nstart.val)
km12 <- kmeans(cluster_vars,centers=12,nstart=nstart.val)
var.exp <- data.frame(k = c(1:12),
bss_tss = c(km1$betweenss/km1$totss,
km2$betweenss/km2$totss,
km3$betweenss/km3$totss,
km4$betweenss/km4$totss,
km5$betweenss/km5$totss,
km6$betweenss/km6$totss,
km7$betweenss/km7$totss,
km8$betweenss/km8$totss,
km9$betweenss/km9$totss,
km10$betweenss/km10$totss,
km11$betweenss/km11$totss,
km12$betweenss/km12$totss))
ggplot(var.exp,aes(x=k,y=bss_tss))+geom_point()
The following chunk creates the new variable based on 5 clusters.
LOS_Age_Clust <- as.factor(km5$cluster) #This creates a new variable based on having 5 clusters.
cluster_vars$LOS_Age_Clust <- LOS_Age_Clust
ggplot(data = cluster_vars, aes(x = Age, y = logLOS, col = LOS_Age_Clust)) + geom_point() + theme(axis.text = element_blank(), legend.title = element_blank()) +ggtitle("Clustering with 5 groups")
##Task 4
My first look is at ER and logRiskscore.
ggplot(readmission,aes(x=factor(Readmission.Status),y=logRiskscore)) + geom_boxplot() +facet_wrap(~factor(ER))
Trying again with Race and Gender.
ggplot(readmission,aes(Gender,fill=factor(Readmission.Status))) + geom_bar(position = "fill") +
facet_wrap(~Race,ncol=2,scales="free")+scale_y_continuous()+ylab("percent")
##Task 5
Before fitting a GLM, I will split the data into training and test sets.
#Create train and test sets
library(caret)
## Loading required package: lattice
set.seed(4321)
partition <- createDataPartition(readmission[,1], list = FALSE, p = .75) #The partition will stratify using variable 1 from the dataframe
train <- readmission[partition, ]
test <- readmission[-partition, ]
print("TRAIN")
## [1] "TRAIN"
mean(train$Readmission.Status)
## [1] 0.1252346
print("TEST")
## [1] "TEST"
mean(test$Readmission.Status)
## [1] 0.1280101
I will now run a glm using the binomial distribution and logit link and add the desired interaction.
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
glmlogit <- glm(Readmission.Status ~ . + Gender*Race, data=train, family = binomial(link="logit"))
summary(glmlogit)
##
## Call:
## glm(formula = Readmission.Status ~ . + Gender * Race, family = binomial(link = "logit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3401 -0.5625 -0.3926 -0.2590 3.5158
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.630481 0.137493 -19.132 < 2e-16 ***
## GenderM 0.002919 0.031255 0.093 0.9256
## RaceBlack -0.013201 0.060223 -0.219 0.8265
## RaceHispanic 0.072077 0.132982 0.542 0.5878
## RaceOthers -0.104211 0.111986 -0.931 0.3521
## ER 0.004022 0.017275 0.233 0.8159
## Age -0.006665 0.001624 -4.105 4.05e-05 ***
## logLOS 0.052063 0.020491 2.541 0.0111 *
## logRiskscore 1.302494 0.023641 55.094 < 2e-16 ***
## Under65 -0.045725 0.057877 -0.790 0.4295
## DRGMed.NoC -0.069340 0.042810 -1.620 0.1053
## DRGOtherMED 0.142270 0.054853 2.594 0.0095 **
## DRGOtherSURG 0.025573 0.067532 0.379 0.7049
## DRGSurg.C -0.032400 0.040172 -0.807 0.4199
## DRGSurg.NoC 0.017780 0.043439 0.409 0.6823
## DRGUNGROUP -0.014643 0.142768 -0.103 0.9183
## GenderM:RaceBlack 0.113370 0.090355 1.255 0.2096
## GenderM:RaceHispanic 0.068279 0.205629 0.332 0.7399
## GenderM:RaceOthers 0.154444 0.159098 0.971 0.3317
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37785 on 50081 degrees of freedom
## Residual deviance: 33818 on 50063 degrees of freedom
## AIC: 33856
##
## Number of Fisher Scoring iterations: 5
predslogit <- predict(glmlogit,newdat=test,type="response")
roclogit <- roc(test$Readmission.Status,predslogit)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
confusionMatrix(factor(1*(predslogit>.5)),factor(test$Readmission.Status))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 14551 2132
## 1 6 5
##
## Accuracy : 0.8719
## 95% CI : (0.8668, 0.877)
## No Information Rate : 0.872
## P-Value [Acc > NIR] : 0.515
##
## Kappa : 0.0033
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.99959
## Specificity : 0.00234
## Pos Pred Value : 0.87221
## Neg Pred Value : 0.45455
## Prevalence : 0.87199
## Detection Rate : 0.87163
## Detection Prevalence : 0.99934
## Balanced Accuracy : 0.50096
##
## 'Positive' Class : 0
##
plot(roclogit)
auc(roclogit)
## Area under the curve: 0.7443
The same code is now run with the probit link.
library(pROC)
glmprobit <- glm(Readmission.Status ~ . + Gender*Race, data=train, family = binomial(link="probit"))
summary(glmprobit)
##
## Call:
## glm(formula = Readmission.Status ~ . + Gender * Race, family = binomial(link = "probit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2610 -0.5734 -0.3966 -0.2449 3.9581
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.4498310 0.0742169 -19.535 < 2e-16 ***
## GenderM 0.0017917 0.0168690 0.106 0.9154
## RaceBlack -0.0097589 0.0325639 -0.300 0.7644
## RaceHispanic 0.0334025 0.0725431 0.460 0.6452
## RaceOthers -0.0546917 0.0596048 -0.918 0.3588
## ER 0.0035282 0.0093310 0.378 0.7053
## Age -0.0040958 0.0008794 -4.658 3.2e-06 ***
## logLOS 0.0265354 0.0112425 2.360 0.0183 *
## logRiskscore 0.6936871 0.0123968 55.957 < 2e-16 ***
## Under65 -0.0301577 0.0313121 -0.963 0.3355
## DRGMed.NoC -0.0378245 0.0230854 -1.638 0.1013
## DRGOtherMED 0.0766117 0.0299537 2.558 0.0105 *
## DRGOtherSURG 0.0164803 0.0367285 0.449 0.6536
## DRGSurg.C -0.0112814 0.0216294 -0.522 0.6020
## DRGSurg.NoC 0.0119932 0.0234597 0.511 0.6092
## DRGUNGROUP -0.0121150 0.0786184 -0.154 0.8775
## GenderM:RaceBlack 0.0675905 0.0492164 1.373 0.1696
## GenderM:RaceHispanic 0.0547836 0.1119560 0.489 0.6246
## GenderM:RaceOthers 0.0688073 0.0857200 0.803 0.4221
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37785 on 50081 degrees of freedom
## Residual deviance: 33802 on 50063 degrees of freedom
## AIC: 33840
##
## Number of Fisher Scoring iterations: 5
predsprobit <- predict(glmprobit,newdat=test,type="response")
rocprobit <- roc(test$Readmission.Status,predsprobit)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
confusionMatrix(factor(1*(predsprobit>.5)),factor(test$Readmission.Status))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 14555 2136
## 1 2 1
##
## Accuracy : 0.8719
## 95% CI : (0.8668, 0.877)
## No Information Rate : 0.872
## P-Value [Acc > NIR] : 0.515
##
## Kappa : 6e-04
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9998626
## Specificity : 0.0004679
## Pos Pred Value : 0.8720268
## Neg Pred Value : 0.3333333
## Prevalence : 0.8719899
## Detection Rate : 0.8718701
## Detection Prevalence : 0.9998203
## Balanced Accuracy : 0.5001653
##
## 'Positive' Class : 0
##
plot(rocprobit)
auc(rocprobit)
## Area under the curve: 0.7444
##Task 6
A new dataframe is created that drops logLOS and Age and adds LOS_Age_Clust. It must then be partitioned.
readmission.cluster <- readmission
readmission.cluster$Age <- NULL
readmission.cluster$logLOS <- NULL
readmission.cluster$LOS_Age_Clust <- LOS_Age_Clust
table(readmission.cluster$LOS_Age_Clust)
##
## 1 2 3 4 5
## 14274 7604 18346 10754 15798
train.cluster <- readmission.cluster[partition, ]
test.cluster <- readmission.cluster[-partition, ]
This code reruns the probit regression using the cluster variable.
library(pROC)
glmprobit.clust <- glm(Readmission.Status ~ . + Gender*Race, data=train.cluster, family = binomial(link="probit"))
summary(glmprobit.clust)
##
## Call:
## glm(formula = Readmission.Status ~ . + Gender * Race, family = binomial(link = "probit"),
## data = train.cluster)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2236 -0.5731 -0.3978 -0.2453 4.0018
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.736969 0.025371 -68.462 < 2e-16 ***
## GenderM 0.006032 0.016858 0.358 0.72050
## RaceBlack -0.003182 0.032523 -0.098 0.92207
## RaceHispanic 0.036379 0.072499 0.502 0.61582
## RaceOthers -0.048185 0.059557 -0.809 0.41848
## ER 0.003631 0.009328 0.389 0.69710
## logRiskscore 0.688514 0.012289 56.026 < 2e-16 ***
## Under65 0.087803 0.032869 2.671 0.00755 **
## DRGMed.NoC -0.037807 0.023080 -1.638 0.10140
## DRGOtherMED 0.075491 0.029944 2.521 0.01170 *
## DRGOtherSURG 0.017360 0.036717 0.473 0.63635
## DRGSurg.C -0.011607 0.021625 -0.537 0.59145
## DRGSurg.NoC 0.011960 0.023454 0.510 0.61009
## DRGUNGROUP -0.018299 0.078695 -0.233 0.81612
## LOS_Age_Clust2 -0.004442 0.041913 -0.106 0.91559
## LOS_Age_Clust3 0.011884 0.023157 0.513 0.60781
## LOS_Age_Clust4 0.070173 0.025051 2.801 0.00509 **
## LOS_Age_Clust5 -0.016639 0.023753 -0.700 0.48362
## GenderM:RaceBlack 0.065427 0.049198 1.330 0.18356
## GenderM:RaceHispanic 0.059008 0.111872 0.527 0.59787
## GenderM:RaceOthers 0.065261 0.085671 0.762 0.44621
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37785 on 50081 degrees of freedom
## Residual deviance: 33814 on 50061 degrees of freedom
## AIC: 33856
##
## Number of Fisher Scoring iterations: 5
predsprobit.clust <- predict(glmprobit.clust,newdat=test.cluster,type="response")
rocprobit.clust <- roc(test.cluster$Readmission.Status,predsprobit.clust)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
confusionMatrix(factor(1*(predsprobit.clust>.5)),factor(test.cluster$Readmission.Status))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 14554 2135
## 1 3 2
##
## Accuracy : 0.8719
## 95% CI : (0.8668, 0.877)
## No Information Rate : 0.872
## P-Value [Acc > NIR] : 0.515
##
## Kappa : 0.0013
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9997939
## Specificity : 0.0009359
## Pos Pred Value : 0.8720714
## Neg Pred Value : 0.4000000
## Prevalence : 0.8719899
## Detection Rate : 0.8718102
## Detection Prevalence : 0.9997005
## Balanced Accuracy : 0.5003649
##
## 'Positive' Class : 0
##
plot(rocprobit.clust)
auc(rocprobit.clust)
## Area under the curve: 0.7434
##Task 7
The first step is to binarize the factor levels that have more than two levels. fullRank is set to FALSE so that all levels get binarized. The interaction variable is also a factor variable and must be binarized as well. In each case the base level will then need to be deleted. I do it this way becasue if fullRank is TRUE I can’t be sure about which level becomes the base.
#Add the interaction variable to the data frame
readmission$RaceGender <- factor(paste0(readmission$Race,readmission$Gender))
summary(readmission$RaceGender)
## BlackF BlackM HispanicF HispanicM OthersF OthersM WhiteF
## 4157 2940 751 535 1206 1067 31891
## WhiteM
## 24229
factor_names <- c("Race","DRG","RaceGender")
factor_vars <- readmission[,factor_names]
for (var in factor_names) {
factor_vars[, var] <- as.character(factor_vars[, var])
}
binarizer <- caret::dummyVars(paste("~", paste(factor_names, collapse = "+")) , data = factor_vars, fullRank = FALSE)
binarized_vars <- data.frame(predict(binarizer, newdata = factor_vars))
head(binarized_vars)
## RaceBlack RaceHispanic RaceOthers RaceWhite DRGMed.C DRGMed.NoC
## 1 0 0 0 1 0 0
## 2 0 0 0 1 0 0
## 3 0 0 0 1 0 0
## 4 0 0 0 1 0 1
## 5 0 0 0 1 0 1
## 6 0 0 0 1 0 0
## DRGOtherMED DRGOtherSURG DRGSurg.C DRGSurg.NoC DRGUNGROUP
## 1 1 0 0 0 0
## 2 0 0 0 1 0
## 3 0 0 0 1 0
## 4 0 0 0 0 0
## 5 0 0 0 0 0
## 6 0 0 1 0 0
## RaceGenderBlackF RaceGenderBlackM RaceGenderHispanicF
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## RaceGenderHispanicM RaceGenderOthersF RaceGenderOthersM RaceGenderWhiteF
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## RaceGenderWhiteM
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
Now delete the three base variables.
binarized_vars$RaceWhite <- NULL
binarized_vars$DRGMed.C <- NULL
binarized_vars$RaceGenderWhiteF <- NULL
head(binarized_vars)
## RaceBlack RaceHispanic RaceOthers DRGMed.NoC DRGOtherMED DRGOtherSURG
## 1 0 0 0 0 1 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 1 0 0
## 5 0 0 0 1 0 0
## 6 0 0 0 0 0 0
## DRGSurg.C DRGSurg.NoC DRGUNGROUP RaceGenderBlackF RaceGenderBlackM
## 1 0 0 0 0 0
## 2 0 1 0 0 0
## 3 0 1 0 0 0
## 4 0 0 0 0 0
## 5 0 0 0 0 0
## 6 1 0 0 0 0
## RaceGenderHispanicF RaceGenderHispanicM RaceGenderOthersF
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## RaceGenderOthersM RaceGenderWhiteM
## 1 0 1
## 2 0 1
## 3 0 1
## 4 0 1
## 5 0 1
## 6 0 1
I now attach the binarized variables and remove the three original factor variables. A new dataframe is created so the old one is preserved.
readmission.bin <- cbind(readmission,binarized_vars)
readmission.bin$DRG <- NULL
readmission.bin$Race <- NULL
readmission.bin$RaceGender <- NULL
summary(readmission.bin)
## Readmission.Status Gender ER Age
## Min. :0.0000 F:38005 Min. :0.0000 Min. : 24.00
## 1st Qu.:0.0000 M:28771 1st Qu.:0.0000 1st Qu.: 67.00
## Median :0.0000 Median :0.0000 Median : 75.00
## Mean :0.1259 Mean :0.5083 Mean : 73.64
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.: 83.00
## Max. :1.0000 Max. :9.0000 Max. :101.00
## logLOS logRiskscore Under65 RaceBlack
## Min. :0.000 Min. :-2.5383 Min. :0.0000 Min. :0.0000
## 1st Qu.:1.099 1st Qu.: 0.1017 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.609 Median : 0.6238 Median :0.0000 Median :0.0000
## Mean :1.653 Mean : 0.6000 Mean :0.1684 Mean :0.1063
## 3rd Qu.:2.079 3rd Qu.: 1.1547 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :3.584 Max. : 2.5102 Max. :1.0000 Max. :1.0000
## RaceHispanic RaceOthers DRGMed.NoC DRGOtherMED
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.0000 Median :0.00000
## Mean :0.01926 Mean :0.03404 Mean :0.1843 Mean :0.08022
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.00000
## DRGOtherSURG DRGSurg.C DRGSurg.NoC DRGUNGROUP
## Min. :0.00000 Min. :0.0000 Min. :0.000 Min. :0.000000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.000000
## Median :0.00000 Median :0.0000 Median :0.000 Median :0.000000
## Mean :0.05128 Mean :0.2316 Mean :0.173 Mean :0.008446
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.000 3rd Qu.:0.000000
## Max. :1.00000 Max. :1.0000 Max. :1.000 Max. :1.000000
## RaceGenderBlackF RaceGenderBlackM RaceGenderHispanicF
## Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.06225 Mean :0.04403 Mean :0.01125
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000
## RaceGenderHispanicM RaceGenderOthersF RaceGenderOthersM RaceGenderWhiteM
## Min. :0.000000 Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.000000 Median :0.00000 Median :0.00000 Median :0.0000
## Mean :0.008012 Mean :0.01806 Mean :0.01598 Mean :0.3628
## 3rd Qu.:0.000000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.000000 Max. :1.00000 Max. :1.00000 Max. :1.0000
I need to again split the data into train and test sets.
train.bin <- readmission.bin[partition, ]
test.bin <- readmission.bin[-partition, ]
I next run the GLM on the binarized data. In doing so, I recognized that the interaction variable created combinations that were redundant. I need to remove all the interactions with male and then re-partition the data.
readmission.bin$RaceGenderOthersM <- NULL
readmission.bin$RaceGenderWhiteM <- NULL
readmission.bin$RaceGenderHispanicM <- NULL
readmission.bin$RaceGenderBlackM <- NULL
summary(readmission.bin)
## Readmission.Status Gender ER Age
## Min. :0.0000 F:38005 Min. :0.0000 Min. : 24.00
## 1st Qu.:0.0000 M:28771 1st Qu.:0.0000 1st Qu.: 67.00
## Median :0.0000 Median :0.0000 Median : 75.00
## Mean :0.1259 Mean :0.5083 Mean : 73.64
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.: 83.00
## Max. :1.0000 Max. :9.0000 Max. :101.00
## logLOS logRiskscore Under65 RaceBlack
## Min. :0.000 Min. :-2.5383 Min. :0.0000 Min. :0.0000
## 1st Qu.:1.099 1st Qu.: 0.1017 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.609 Median : 0.6238 Median :0.0000 Median :0.0000
## Mean :1.653 Mean : 0.6000 Mean :0.1684 Mean :0.1063
## 3rd Qu.:2.079 3rd Qu.: 1.1547 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :3.584 Max. : 2.5102 Max. :1.0000 Max. :1.0000
## RaceHispanic RaceOthers DRGMed.NoC DRGOtherMED
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.0000 Median :0.00000
## Mean :0.01926 Mean :0.03404 Mean :0.1843 Mean :0.08022
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.00000
## DRGOtherSURG DRGSurg.C DRGSurg.NoC DRGUNGROUP
## Min. :0.00000 Min. :0.0000 Min. :0.000 Min. :0.000000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.000000
## Median :0.00000 Median :0.0000 Median :0.000 Median :0.000000
## Mean :0.05128 Mean :0.2316 Mean :0.173 Mean :0.008446
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.000 3rd Qu.:0.000000
## Max. :1.00000 Max. :1.0000 Max. :1.000 Max. :1.000000
## RaceGenderBlackF RaceGenderHispanicF RaceGenderOthersF
## Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.06225 Mean :0.01125 Mean :0.01806
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000
train.bin <- readmission.bin[partition, ]
test.bin <- readmission.bin[-partition, ]
glmprobit <- glm(Readmission.Status ~ . , data=train.bin, family = binomial(link="probit"))
summary(glmprobit)
##
## Call:
## glm(formula = Readmission.Status ~ ., family = binomial(link = "probit"),
## data = train.bin)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2610 -0.5734 -0.3966 -0.2449 3.9581
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.4498310 0.0742169 -19.535 < 2e-16 ***
## GenderM 0.0017917 0.0168690 0.106 0.9154
## ER 0.0035282 0.0093310 0.378 0.7053
## Age -0.0040958 0.0008794 -4.658 3.2e-06 ***
## logLOS 0.0265354 0.0112425 2.360 0.0183 *
## logRiskscore 0.6936871 0.0123968 55.957 < 2e-16 ***
## Under65 -0.0301577 0.0313121 -0.963 0.3355
## RaceBlack 0.0578316 0.0373272 1.549 0.1213
## RaceHispanic 0.0881861 0.0853691 1.033 0.3016
## RaceOthers 0.0141156 0.0616693 0.229 0.8190
## DRGMed.NoC -0.0378245 0.0230854 -1.638 0.1013
## DRGOtherMED 0.0766117 0.0299537 2.558 0.0105 *
## DRGOtherSURG 0.0164803 0.0367285 0.449 0.6536
## DRGSurg.C -0.0112814 0.0216294 -0.522 0.6020
## DRGSurg.NoC 0.0119932 0.0234597 0.511 0.6092
## DRGUNGROUP -0.0121150 0.0786184 -0.154 0.8775
## RaceGenderBlackF -0.0675905 0.0492164 -1.373 0.1696
## RaceGenderHispanicF -0.0547836 0.1119560 -0.489 0.6246
## RaceGenderOthersF -0.0688073 0.0857200 -0.803 0.4221
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37785 on 50081 degrees of freedom
## Residual deviance: 33802 on 50063 degrees of freedom
## AIC: 33840
##
## Number of Fisher Scoring iterations: 5
predsprobit <- predict(glmprobit,newdat=test.bin,type="response")
rocprobit <- roc(test.bin$Readmission.Status,predsprobit)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
confusionMatrix(factor(1*(predsprobit>.5)),factor(test.bin$Readmission.Status))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 14555 2136
## 1 2 1
##
## Accuracy : 0.8719
## 95% CI : (0.8668, 0.877)
## No Information Rate : 0.872
## P-Value [Acc > NIR] : 0.515
##
## Kappa : 6e-04
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9998626
## Specificity : 0.0004679
## Pos Pred Value : 0.8720268
## Neg Pred Value : 0.3333333
## Prevalence : 0.8719899
## Detection Rate : 0.8718701
## Detection Prevalence : 0.9998203
## Balanced Accuracy : 0.5001653
##
## 'Positive' Class : 0
##
plot(rocprobit)
auc(rocprobit)
## Area under the curve: 0.7444
The next step is to run stepAIC on this model.
library(MASS)
stepAIC(glmprobit)
## Start: AIC=33839.56
## Readmission.Status ~ Gender + ER + Age + logLOS + logRiskscore +
## Under65 + RaceBlack + RaceHispanic + RaceOthers + DRGMed.NoC +
## DRGOtherMED + DRGOtherSURG + DRGSurg.C + DRGSurg.NoC + DRGUNGROUP +
## RaceGenderBlackF + RaceGenderHispanicF + RaceGenderOthersF
##
## Df Deviance AIC
## - Gender 1 33802 33838
## - DRGUNGROUP 1 33802 33838
## - RaceOthers 1 33802 33838
## - ER 1 33802 33838
## - DRGOtherSURG 1 33802 33838
## - RaceGenderHispanicF 1 33802 33838
## - DRGSurg.NoC 1 33802 33838
## - DRGSurg.C 1 33802 33838
## - RaceGenderOthersF 1 33802 33838
## - Under65 1 33802 33838
## - RaceHispanic 1 33803 33839
## - RaceGenderBlackF 1 33803 33839
## <none> 33802 33840
## - RaceBlack 1 33804 33840
## - DRGMed.NoC 1 33804 33840
## - logLOS 1 33807 33843
## - DRGOtherMED 1 33808 33844
## - Age 1 33823 33859
## - logRiskscore 1 37448 37484
##
## Step: AIC=33837.58
## Readmission.Status ~ ER + Age + logLOS + logRiskscore + Under65 +
## RaceBlack + RaceHispanic + RaceOthers + DRGMed.NoC + DRGOtherMED +
## DRGOtherSURG + DRGSurg.C + DRGSurg.NoC + DRGUNGROUP + RaceGenderBlackF +
## RaceGenderHispanicF + RaceGenderOthersF
##
## Df Deviance AIC
## - DRGUNGROUP 1 33802 33836
## - RaceOthers 1 33802 33836
## - ER 1 33802 33836
## - DRGOtherSURG 1 33802 33836
## - DRGSurg.NoC 1 33802 33836
## - RaceGenderHispanicF 1 33802 33836
## - DRGSurg.C 1 33802 33836
## - RaceGenderOthersF 1 33802 33836
## - Under65 1 33803 33837
## - RaceHispanic 1 33803 33837
## <none> 33802 33838
## - RaceGenderBlackF 1 33804 33838
## - RaceBlack 1 33804 33838
## - DRGMed.NoC 1 33804 33838
## - logLOS 1 33807 33841
## - DRGOtherMED 1 33808 33842
## - Age 1 33823 33857
## - logRiskscore 1 37448 37482
##
## Step: AIC=33835.6
## Readmission.Status ~ ER + Age + logLOS + logRiskscore + Under65 +
## RaceBlack + RaceHispanic + RaceOthers + DRGMed.NoC + DRGOtherMED +
## DRGOtherSURG + DRGSurg.C + DRGSurg.NoC + RaceGenderBlackF +
## RaceGenderHispanicF + RaceGenderOthersF
##
## Df Deviance AIC
## - RaceOthers 1 33802 33834
## - ER 1 33802 33834
## - DRGOtherSURG 1 33802 33834
## - DRGSurg.C 1 33802 33834
## - RaceGenderHispanicF 1 33802 33834
## - DRGSurg.NoC 1 33802 33834
## - RaceGenderOthersF 1 33802 33834
## - Under65 1 33803 33835
## - RaceHispanic 1 33803 33835
## <none> 33802 33836
## - RaceGenderBlackF 1 33804 33836
## - RaceBlack 1 33804 33836
## - DRGMed.NoC 1 33804 33836
## - logLOS 1 33807 33839
## - DRGOtherMED 1 33808 33840
## - Age 1 33824 33856
## - logRiskscore 1 37449 37481
##
## Step: AIC=33833.66
## Readmission.Status ~ ER + Age + logLOS + logRiskscore + Under65 +
## RaceBlack + RaceHispanic + DRGMed.NoC + DRGOtherMED + DRGOtherSURG +
## DRGSurg.C + DRGSurg.NoC + RaceGenderBlackF + RaceGenderHispanicF +
## RaceGenderOthersF
##
## Df Deviance AIC
## - ER 1 33802 33832
## - DRGOtherSURG 1 33802 33832
## - DRGSurg.C 1 33802 33832
## - RaceGenderHispanicF 1 33802 33832
## - DRGSurg.NoC 1 33802 33832
## - RaceGenderOthersF 1 33803 33833
## - Under65 1 33803 33833
## - RaceHispanic 1 33803 33833
## <none> 33802 33834
## - RaceGenderBlackF 1 33804 33834
## - RaceBlack 1 33804 33834
## - DRGMed.NoC 1 33804 33834
## - logLOS 1 33807 33837
## - DRGOtherMED 1 33808 33838
## - Age 1 33824 33854
## - logRiskscore 1 37449 37479
##
## Step: AIC=33831.8
## Readmission.Status ~ Age + logLOS + logRiskscore + Under65 +
## RaceBlack + RaceHispanic + DRGMed.NoC + DRGOtherMED + DRGOtherSURG +
## DRGSurg.C + DRGSurg.NoC + RaceGenderBlackF + RaceGenderHispanicF +
## RaceGenderOthersF
##
## Df Deviance AIC
## - DRGOtherSURG 1 33802 33830
## - DRGSurg.C 1 33802 33830
## - RaceGenderHispanicF 1 33802 33830
## - DRGSurg.NoC 1 33802 33830
## - RaceGenderOthersF 1 33803 33831
## - Under65 1 33803 33831
## - RaceHispanic 1 33803 33831
## <none> 33802 33832
## - RaceGenderBlackF 1 33804 33832
## - RaceBlack 1 33804 33832
## - DRGMed.NoC 1 33804 33832
## - logLOS 1 33807 33835
## - DRGOtherMED 1 33808 33836
## - Age 1 33824 33852
## - logRiskscore 1 37450 37478
##
## Step: AIC=33830.01
## Readmission.Status ~ Age + logLOS + logRiskscore + Under65 +
## RaceBlack + RaceHispanic + DRGMed.NoC + DRGOtherMED + DRGSurg.C +
## DRGSurg.NoC + RaceGenderBlackF + RaceGenderHispanicF + RaceGenderOthersF
##
## Df Deviance AIC
## - DRGSurg.NoC 1 33802 33828
## - RaceGenderHispanicF 1 33802 33828
## - DRGSurg.C 1 33802 33828
## - RaceGenderOthersF 1 33803 33829
## - Under65 1 33803 33829
## - RaceHispanic 1 33803 33829
## <none> 33802 33830
## - RaceGenderBlackF 1 33804 33830
## - RaceBlack 1 33805 33831
## - DRGMed.NoC 1 33805 33831
## - logLOS 1 33808 33834
## - DRGOtherMED 1 33808 33834
## - Age 1 33824 33850
## - logRiskscore 1 37450 37476
##
## Step: AIC=33828.2
## Readmission.Status ~ Age + logLOS + logRiskscore + Under65 +
## RaceBlack + RaceHispanic + DRGMed.NoC + DRGOtherMED + DRGSurg.C +
## RaceGenderBlackF + RaceGenderHispanicF + RaceGenderOthersF
##
## Df Deviance AIC
## - RaceGenderHispanicF 1 33802 33826
## - DRGSurg.C 1 33803 33827
## - RaceGenderOthersF 1 33803 33827
## - Under65 1 33803 33827
## - RaceHispanic 1 33803 33827
## <none> 33802 33828
## - RaceGenderBlackF 1 33804 33828
## - RaceBlack 1 33805 33829
## - DRGMed.NoC 1 33807 33831
## - logLOS 1 33808 33832
## - DRGOtherMED 1 33808 33832
## - Age 1 33824 33848
## - logRiskscore 1 37450 37474
##
## Step: AIC=33826.46
## Readmission.Status ~ Age + logLOS + logRiskscore + Under65 +
## RaceBlack + RaceHispanic + DRGMed.NoC + DRGOtherMED + DRGSurg.C +
## RaceGenderBlackF + RaceGenderOthersF
##
## Df Deviance AIC
## - DRGSurg.C 1 33803 33825
## - RaceGenderOthersF 1 33803 33825
## - Under65 1 33803 33825
## - RaceHispanic 1 33803 33825
## <none> 33802 33826
## - RaceGenderBlackF 1 33805 33827
## - RaceBlack 1 33805 33827
## - DRGMed.NoC 1 33807 33829
## - logLOS 1 33808 33830
## - DRGOtherMED 1 33809 33831
## - Age 1 33825 33847
## - logRiskscore 1 37450 37472
##
## Step: AIC=33825.22
## Readmission.Status ~ Age + logLOS + logRiskscore + Under65 +
## RaceBlack + RaceHispanic + DRGMed.NoC + DRGOtherMED + RaceGenderBlackF +
## RaceGenderOthersF
##
## Df Deviance AIC
## - RaceGenderOthersF 1 33804 33824
## - Under65 1 33804 33824
## - RaceHispanic 1 33804 33824
## <none> 33803 33825
## - RaceGenderBlackF 1 33805 33825
## - RaceBlack 1 33806 33826
## - DRGMed.NoC 1 33807 33827
## - logLOS 1 33809 33829
## - DRGOtherMED 1 33811 33831
## - Age 1 33825 33845
## - logRiskscore 1 37453 37473
##
## Step: AIC=33824.13
## Readmission.Status ~ Age + logLOS + logRiskscore + Under65 +
## RaceBlack + RaceHispanic + DRGMed.NoC + DRGOtherMED + RaceGenderBlackF
##
## Df Deviance AIC
## - Under65 1 33805 33823
## - RaceHispanic 1 33805 33823
## <none> 33804 33824
## - RaceGenderBlackF 1 33806 33824
## - RaceBlack 1 33807 33825
## - DRGMed.NoC 1 33808 33826
## - logLOS 1 33810 33828
## - DRGOtherMED 1 33812 33830
## - Age 1 33826 33844
## - logRiskscore 1 37454 37472
##
## Step: AIC=33823.07
## Readmission.Status ~ Age + logLOS + logRiskscore + RaceBlack +
## RaceHispanic + DRGMed.NoC + DRGOtherMED + RaceGenderBlackF
##
## Df Deviance AIC
## - RaceHispanic 1 33806 33822
## <none> 33805 33823
## - RaceGenderBlackF 1 33807 33823
## - RaceBlack 1 33808 33824
## - DRGMed.NoC 1 33809 33825
## - logLOS 1 33811 33827
## - DRGOtherMED 1 33813 33829
## - Age 1 33841 33857
## - logRiskscore 1 37461 37477
##
## Step: AIC=33822.06
## Readmission.Status ~ Age + logLOS + logRiskscore + RaceBlack +
## DRGMed.NoC + DRGOtherMED + RaceGenderBlackF
##
## Df Deviance AIC
## <none> 33806 33822
## - RaceGenderBlackF 1 33808 33822
## - RaceBlack 1 33808 33822
## - DRGMed.NoC 1 33810 33824
## - logLOS 1 33812 33826
## - DRGOtherMED 1 33814 33828
## - Age 1 33843 33857
## - logRiskscore 1 37462 37476
##
## Call: glm(formula = Readmission.Status ~ Age + logLOS + logRiskscore +
## RaceBlack + DRGMed.NoC + DRGOtherMED + RaceGenderBlackF,
## family = binomial(link = "probit"), data = train.bin)
##
## Coefficients:
## (Intercept) Age logLOS logRiskscore
## -1.496292 -0.003491 0.026865 0.693134
## RaceBlack DRGMed.NoC DRGOtherMED RaceGenderBlackF
## 0.056380 -0.038102 0.076249 -0.068861
##
## Degrees of Freedom: 50081 Total (i.e. Null); 50074 Residual
## Null Deviance: 37780
## Residual Deviance: 33810 AIC: 33820
The probit model is now run using the five surviving variables.
glmprobit <- glm(Readmission.Status ~ logLOS + Age + logRiskscore + DRGOtherSURG + DRGOtherMED, data=train.bin, family = binomial(link="probit"))
summary(glmprobit)
##
## Call:
## glm(formula = Readmission.Status ~ logLOS + Age + logRiskscore +
## DRGOtherSURG + DRGOtherMED, family = binomial(link = "probit"),
## data = train.bin)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2665 -0.5741 -0.3967 -0.2453 3.9514
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.4957724 0.0470444 -31.795 < 2e-16 ***
## logLOS 0.0262253 0.0112344 2.334 0.01958 *
## Age -0.0035797 0.0005678 -6.304 2.89e-10 ***
## logRiskscore 0.6931615 0.0123750 56.013 < 2e-16 ***
## DRGOtherSURG 0.0254804 0.0346815 0.735 0.46252
## DRGOtherMED 0.0852455 0.0274126 3.110 0.00187 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37785 on 50081 degrees of freedom
## Residual deviance: 33812 on 50076 degrees of freedom
## AIC: 33824
##
## Number of Fisher Scoring iterations: 5
predsprobit <- predict(glmprobit,newdat=test.bin,type="response")
rocprobit <- roc(test.bin$Readmission.Status,predsprobit)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
confusionMatrix(factor(1*(predsprobit>.5)),factor(test.bin$Readmission.Status))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 14554 2134
## 1 3 3
##
## Accuracy : 0.872
## 95% CI : (0.8668, 0.877)
## No Information Rate : 0.872
## P-Value [Acc > NIR] : 0.5058
##
## Kappa : 0.0021
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.999794
## Specificity : 0.001404
## Pos Pred Value : 0.872124
## Neg Pred Value : 0.500000
## Prevalence : 0.871990
## Detection Rate : 0.871810
## Detection Prevalence : 0.999641
## Balanced Accuracy : 0.500599
##
## 'Positive' Class : 0
##
plot(rocprobit)
auc(rocprobit)
## Area under the curve: 0.7451
##Task 8
I first run the model on the full dataset.
glmprobit <- glm(Readmission.Status ~ logLOS + Age + logRiskscore + DRGOtherSURG + DRGOtherMED, data=readmission.bin, family = binomial(link="probit"))
glmprobit2 <- glm(Readmission.Status ~ logLOS + Age + logRiskscore + DRGMed.NoC + DRGOtherMED, data=readmission.bin, family = binomial(link="probit"))
summary(glmprobit)
##
## Call:
## glm(formula = Readmission.Status ~ logLOS + Age + logRiskscore +
## DRGOtherSURG + DRGOtherMED, family = binomial(link = "probit"),
## data = readmission.bin)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2680 -0.5747 -0.3952 -0.2421 3.9745
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.5026235 0.0407530 -36.871 < 2e-16 ***
## logLOS 0.0329948 0.0097365 3.389 0.000702 ***
## Age -0.0036999 0.0004916 -7.526 5.25e-14 ***
## logRiskscore 0.7008229 0.0107283 65.325 < 2e-16 ***
## DRGOtherSURG 0.0560715 0.0293707 1.909 0.056250 .
## DRGOtherMED 0.0691546 0.0237996 2.906 0.003664 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 50559 on 66775 degrees of freedom
## Residual deviance: 45116 on 66770 degrees of freedom
## AIC: 45128
##
## Number of Fisher Scoring iterations: 5
summary(glmprobit2)
##
## Call:
## glm(formula = Readmission.Status ~ logLOS + Age + logRiskscore +
## DRGMed.NoC + DRGOtherMED, family = binomial(link = "probit"),
## data = readmission.bin)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2683 -0.5749 -0.3953 -0.2418 3.9690
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.4961959 0.0408457 -36.630 < 2e-16 ***
## logLOS 0.0336166 0.0097341 3.454 0.000553 ***
## Age -0.0037051 0.0004916 -7.536 4.84e-14 ***
## logRiskscore 0.7010516 0.0107302 65.334 < 2e-16 ***
## DRGMed.NoC -0.0202730 0.0172126 -1.178 0.238878
## DRGOtherMED 0.0617021 0.0239867 2.572 0.010101 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 50559 on 66775 degrees of freedom
## Residual deviance: 45118 on 66770 degrees of freedom
## AIC: 45130
##
## Number of Fisher Scoring iterations: 5
I now create some arbitrary patients to learn how changing the values affects the predicted probability of readmission.
new.data <- data.frame("logLOS" = c(log(5),log(6),log(5),log(5),log(5),log(5)), "Age" = c(75,75,80,75,75,75), "logRiskscore" = c(log(1.866),log(1.866),log(1.866),log(2.053),log(1.866),log(1.866)), "DRGOtherSURG" = c(0,0,0,0,1,0), "DRGOtherMED" = c(0,0,0,0,0,1))
new.data
## logLOS Age logRiskscore DRGOtherSURG DRGOtherMED
## 1 1.609438 75 0.6237971 0 0
## 2 1.791759 75 0.6237971 0 0
## 3 1.609438 80 0.6237971 0 0
## 4 1.609438 75 0.7193021 0 0
## 5 1.609438 75 0.6237971 1 0
## 6 1.609438 75 0.6237971 0 1
predict(glmprobit, newdat = new.data, type = "response")
## 1 2 3 4 5 6
## 0.09855331 0.09960192 0.09537932 0.11068247 0.10864481 0.11110281
##Task 9
I rerun the model on the full dataset.
glmprobit <- glm(Readmission.Status ~ logLOS + Age + logRiskscore + DRGOtherSURG + DRGOtherMED, data=readmission.bin, family = binomial(link="probit"))
summary(glmprobit)
##
## Call:
## glm(formula = Readmission.Status ~ logLOS + Age + logRiskscore +
## DRGOtherSURG + DRGOtherMED, family = binomial(link = "probit"),
## data = readmission.bin)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2680 -0.5747 -0.3952 -0.2421 3.9745
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.5026235 0.0407530 -36.871 < 2e-16 ***
## logLOS 0.0329948 0.0097365 3.389 0.000702 ***
## Age -0.0036999 0.0004916 -7.526 5.25e-14 ***
## logRiskscore 0.7008229 0.0107283 65.325 < 2e-16 ***
## DRGOtherSURG 0.0560715 0.0293707 1.909 0.056250 .
## DRGOtherMED 0.0691546 0.0237996 2.906 0.003664 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 50559 on 66775 degrees of freedom
## Residual deviance: 45116 on 66770 degrees of freedom
## AIC: 45128
##
## Number of Fisher Scoring iterations: 5
predsprobit <- predict(glmprobit,newdat=readmission.bin,type="response")
confusionMatrix(factor(1*(predsprobit>.5)),factor(readmission.bin$Readmission.Status))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 58356 8399
## 1 11 10
##
## Accuracy : 0.8741
## 95% CI : (0.8715, 0.8766)
## No Information Rate : 0.8741
## P-Value [Acc > NIR] : 0.5076
##
## Kappa : 0.0017
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.999812
## Specificity : 0.001189
## Pos Pred Value : 0.874182
## Neg Pred Value : 0.476190
## Prevalence : 0.874072
## Detection Rate : 0.873907
## Detection Prevalence : 0.999686
## Balanced Accuracy : 0.500500
##
## 'Positive' Class : 0
##
This code calculates the cost for different cutoff values. I ran it at different values and am presting the final choice of 0.08.
cutoff <- 0.08
pred_readmit <- 1*(predsprobit > cutoff)
cm <- confusionMatrix(factor(pred_readmit),factor(readmission.bin$Readmission.Status))
no_intervention_cost <- 25*sum(readmission.bin$Readmission.Status == 1)
full_intervention_cost <- 2*nrow(readmission.bin)
modified_cost <- cm$table[2,1]*2+cm$table[2,2]*2+cm$table[1,2]*25
no_intervention_cost
## [1] 210225
full_intervention_cost
## [1] 133552
modified_cost
## [1] 106002
The final step is to get the confusion matrix.
cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 26379 1096
## 1 31988 7313
##
## Accuracy : 0.5046
## 95% CI : (0.5008, 0.5084)
## No Information Rate : 0.8741
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.125
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.4520
## Specificity : 0.8697
## Pos Pred Value : 0.9601
## Neg Pred Value : 0.1861
## Prevalence : 0.8741
## Detection Rate : 0.3950
## Detection Prevalence : 0.4115
## Balanced Accuracy : 0.6608
##
## 'Positive' Class : 0
##
##ALTERNATIVE FEATURE SELECTION
This section presents an alternative approach. It is based on using hypothesis tests to sequentially remove features. It begins by re-running the probit model on the training set using all the available features.
glmprobit <- glm(Readmission.Status ~ . + Race*Gender, data=train, family = binomial(link="probit"))
summary(glmprobit)
##
## Call:
## glm(formula = Readmission.Status ~ . + Race * Gender, family = binomial(link = "probit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2610 -0.5734 -0.3966 -0.2449 3.9581
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.4498310 0.0742169 -19.535 < 2e-16 ***
## GenderM 0.0017917 0.0168690 0.106 0.9154
## RaceBlack -0.0097589 0.0325639 -0.300 0.7644
## RaceHispanic 0.0334025 0.0725431 0.460 0.6452
## RaceOthers -0.0546917 0.0596048 -0.918 0.3588
## ER 0.0035282 0.0093310 0.378 0.7053
## Age -0.0040958 0.0008794 -4.658 3.2e-06 ***
## logLOS 0.0265354 0.0112425 2.360 0.0183 *
## logRiskscore 0.6936871 0.0123968 55.957 < 2e-16 ***
## Under65 -0.0301577 0.0313121 -0.963 0.3355
## DRGMed.NoC -0.0378245 0.0230854 -1.638 0.1013
## DRGOtherMED 0.0766117 0.0299537 2.558 0.0105 *
## DRGOtherSURG 0.0164803 0.0367285 0.449 0.6536
## DRGSurg.C -0.0112814 0.0216294 -0.522 0.6020
## DRGSurg.NoC 0.0119932 0.0234597 0.511 0.6092
## DRGUNGROUP -0.0121150 0.0786184 -0.154 0.8775
## GenderM:RaceBlack 0.0675905 0.0492164 1.373 0.1696
## GenderM:RaceHispanic 0.0547836 0.1119560 0.489 0.6246
## GenderM:RaceOthers 0.0688073 0.0857200 0.803 0.4221
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37785 on 50081 degrees of freedom
## Residual deviance: 33802 on 50063 degrees of freedom
## AIC: 33840
##
## Number of Fisher Scoring iterations: 5
It is difficult to deal with a categorical interaction variable if the goal is to remove levels. With the interaction, there are actually 8 levels in play and they would need to be created as a new variable in order to merge some of them. Given that none of the three interaction terms appear to add value, I’ll save time and go ahead and remove them.
glmprobit <- glm(Readmission.Status ~ ., data=train, family = binomial(link="probit"))
summary(glmprobit)
##
## Call:
## glm(formula = Readmission.Status ~ ., family = binomial(link = "probit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2655 -0.5734 -0.3965 -0.2451 3.9526
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.4561795 0.0741299 -19.644 < 2e-16 ***
## GenderM 0.0126145 0.0154395 0.817 0.4139
## RaceBlack 0.0192307 0.0246860 0.779 0.4360
## RaceHispanic 0.0564365 0.0553110 1.020 0.3076
## RaceOthers -0.0222827 0.0428616 -0.520 0.6031
## ER 0.0035240 0.0093298 0.378 0.7056
## Age -0.0040723 0.0008793 -4.631 3.63e-06 ***
## logLOS 0.0263916 0.0112413 2.348 0.0189 *
## logRiskscore 0.6937428 0.0123958 55.966 < 2e-16 ***
## Under65 -0.0288886 0.0313046 -0.923 0.3561
## DRGMed.NoC -0.0379484 0.0230833 -1.644 0.1002
## DRGOtherMED 0.0764142 0.0299528 2.551 0.0107 *
## DRGOtherSURG 0.0172804 0.0367224 0.471 0.6379
## DRGSurg.C -0.0114789 0.0216281 -0.531 0.5956
## DRGSurg.NoC 0.0116945 0.0234562 0.499 0.6181
## DRGUNGROUP -0.0122043 0.0786236 -0.155 0.8766
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37785 on 50081 degrees of freedom
## Residual deviance: 33804 on 50066 degrees of freedom
## AIC: 33836
##
## Number of Fisher Scoring iterations: 5
Normally the next step would be combine Race-Others with Race-White (the base) but given the similarity to Race-Hispanic, both will be combined with Race-White. This is mostly done to save time.
readmission$RaceGender <- NULL #Need to remove this variable that was created earlier.
readmission2<-readmission
library(plyr)
var <- "Race"
var.levels <- levels(readmission2[,var])
readmission2[,var] <- mapvalues(readmission2[,var],var.levels,c("NonBlack","Black","NonBlack","NonBlack"))
#Relevel
table <- as.data.frame(table(readmission2[,var]))
max <- which.max(table[,2])
level.name <- as.character(table[max,1])
readmission2[,var] <- relevel(readmission2[,var], ref = level.name)
table(readmission2[,var])
##
## NonBlack Black
## 59679 7097
Running the model again, remembering to create new train and test sets. The same partition continues to be used to keep results consistent.
readmission <- readmission2
train <- readmission[partition,]
test <- readmission[-partition,]
glmprobit <- glm(Readmission.Status ~ ., data=train, family = binomial(link="probit"))
summary(glmprobit)
##
## Call:
## glm(formula = Readmission.Status ~ ., family = binomial(link = "probit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2661 -0.5734 -0.3966 -0.2448 3.9520
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.4563982 0.0740194 -19.676 < 2e-16 ***
## GenderM 0.0123990 0.0154377 0.803 0.4219
## RaceBlack 0.0187312 0.0245815 0.762 0.4461
## ER 0.0035314 0.0093294 0.379 0.7050
## Age -0.0040643 0.0008788 -4.625 3.75e-06 ***
## logLOS 0.0264274 0.0112407 2.351 0.0187 *
## logRiskscore 0.6937402 0.0123961 55.965 < 2e-16 ***
## Under65 -0.0280045 0.0312893 -0.895 0.3708
## DRGMed.NoC -0.0380316 0.0230823 -1.648 0.0994 .
## DRGOtherMED 0.0764746 0.0299516 2.553 0.0107 *
## DRGOtherSURG 0.0167651 0.0367184 0.457 0.6480
## DRGSurg.C -0.0115843 0.0216275 -0.536 0.5922
## DRGSurg.NoC 0.0114905 0.0234549 0.490 0.6242
## DRGUNGROUP -0.0121335 0.0786215 -0.154 0.8774
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37785 on 50081 degrees of freedom
## Residual deviance: 33805 on 50068 degrees of freedom
## AIC: 33833
##
## Number of Fisher Scoring iterations: 5
For similar reasons, we simultaneously merge the two insignificant DRG levels into the base.
readmission2<-readmission
var <- "DRG"
var.levels <- levels(readmission2[,var])
readmission2[,var] <- mapvalues(readmission2[,var],var.levels,c("DRGbase","Med.NoC","OtherMED","OtherSURG","DRGbase","DRGbase","UNGROUP"))
#Relevel
table <- as.data.frame(table(readmission2[,var]))
max <- which.max(table[,2])
level.name <- as.character(table[max,1])
readmission2[,var] <- relevel(readmission2[,var], ref = level.name)
table(readmission2[,var])
##
## DRGbase Med.NoC OtherMED OtherSURG UNGROUP
## 45121 12310 5357 3424 564
readmission <- readmission2
train <- readmission[partition,]
test <- readmission[-partition,]
glmprobit <- glm(Readmission.Status ~ ., data=train, family = binomial(link="probit"))
summary(glmprobit)
##
## Call:
## glm(formula = Readmission.Status ~ ., family = binomial(link = "probit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2662 -0.5731 -0.3965 -0.2448 3.9422
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.4570340 0.0731439 -19.920 < 2e-16 ***
## GenderM 0.0122968 0.0154369 0.797 0.42569
## RaceBlack 0.0186226 0.0245817 0.758 0.44870
## ER 0.0035828 0.0093291 0.384 0.70095
## Age -0.0040696 0.0008788 -4.631 3.64e-06 ***
## logLOS 0.0264496 0.0112407 2.353 0.01862 *
## logRiskscore 0.6937479 0.0123953 55.968 < 2e-16 ***
## Under65 -0.0280057 0.0312892 -0.895 0.37076
## DRGMed.NoC -0.0370186 0.0201685 -1.835 0.06644 .
## DRGOtherMED 0.0774743 0.0277714 2.790 0.00528 **
## DRGOtherSURG 0.0177706 0.0349618 0.508 0.61125
## DRGUNGROUP -0.0111276 0.0778191 -0.143 0.88630
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37785 on 50081 degrees of freedom
## Residual deviance: 33806 on 50070 degrees of freedom
## AIC: 33830
##
## Number of Fisher Scoring iterations: 5
Now remove ER.
glmprobit <- glm(Readmission.Status ~ Gender + Race + Age + logLOS + logRiskscore + Under65 + DRG, data=train, family = binomial(link="probit"))
summary(glmprobit)
##
## Call:
## glm(formula = Readmission.Status ~ Gender + Race + Age + logLOS +
## logRiskscore + Under65 + DRG, family = binomial(link = "probit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2650 -0.5732 -0.3967 -0.2450 3.9438
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.4551391 0.0729745 -19.940 < 2e-16 ***
## GenderM 0.0122698 0.0154363 0.795 0.42669
## RaceBlack 0.0185940 0.0245811 0.756 0.44939
## Age -0.0040694 0.0008788 -4.631 3.65e-06 ***
## logLOS 0.0263961 0.0112398 2.348 0.01885 *
## logRiskscore 0.6937771 0.0123950 55.973 < 2e-16 ***
## Under65 -0.0279778 0.0312889 -0.894 0.37123
## DRGMed.NoC -0.0370394 0.0201684 -1.837 0.06628 .
## DRGOtherMED 0.0775242 0.0277709 2.792 0.00525 **
## DRGOtherSURG 0.0177182 0.0349616 0.507 0.61230
## DRGUNGROUP -0.0111023 0.0778154 -0.143 0.88655
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37785 on 50081 degrees of freedom
## Residual deviance: 33807 on 50071 degrees of freedom
## AIC: 33829
##
## Number of Fisher Scoring iterations: 5
Now remove Gender
glmprobit <- glm(Readmission.Status ~ Race + Age + logLOS + logRiskscore + Under65 + DRG, data=train, family = binomial(link="probit"))
summary(glmprobit)
##
## Call:
## glm(formula = Readmission.Status ~ Race + Age + logLOS + logRiskscore +
## Under65 + DRG, family = binomial(link = "probit"), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2610 -0.5734 -0.3968 -0.2450 3.9488
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.4454061 0.0719427 -20.091 < 2e-16 ***
## RaceBlack 0.0182451 0.0245749 0.742 0.45783
## Age -0.0041287 0.0008756 -4.715 2.41e-06 ***
## logLOS 0.0264271 0.0112398 2.351 0.01871 *
## logRiskscore 0.6939005 0.0123935 55.989 < 2e-16 ***
## Under65 -0.0288084 0.0312723 -0.921 0.35694
## DRGMed.NoC -0.0369944 0.0201681 -1.834 0.06661 .
## DRGOtherMED 0.0773078 0.0277702 2.784 0.00537 **
## DRGOtherSURG 0.0175849 0.0349617 0.503 0.61498
## DRGUNGROUP -0.0109392 0.0778137 -0.141 0.88820
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37785 on 50081 degrees of freedom
## Residual deviance: 33807 on 50072 degrees of freedom
## AIC: 33827
##
## Number of Fisher Scoring iterations: 5
Rather than remove DRGUNGROUP at this stage, it will be combined with DRGOtherSURG. The difference in significance is likely due to sample sizes.
readmission2<-readmission
var <- "DRG"
var.levels <- levels(readmission2[,var])
readmission2[,var] <- mapvalues(readmission2[,var],var.levels,c("DRGbase","Med.NoC","OtherMED","OSUngroup","OSUngroup"))
#Relevel
table <- as.data.frame(table(readmission2[,var]))
max <- which.max(table[,2])
level.name <- as.character(table[max,1])
readmission2[,var] <- relevel(readmission2[,var], ref = level.name)
table(readmission2[,var])
##
## DRGbase Med.NoC OtherMED OSUngroup
## 45121 12310 5357 3988
readmission <- readmission2
train <- readmission[partition,]
test <- readmission[-partition,]
glmprobit <- glm(Readmission.Status ~ Race + Age + logLOS + logRiskscore + Under65 + DRG, data=train, family = binomial(link="probit"))
summary(glmprobit)
##
## Call:
## glm(formula = Readmission.Status ~ Race + Age + logLOS + logRiskscore +
## Under65 + DRG, family = binomial(link = "probit"), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2610 -0.5734 -0.3967 -0.2450 3.9487
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.4453515 0.0719416 -20.091 < 2e-16 ***
## RaceBlack 0.0181893 0.0245749 0.740 0.45920
## Age -0.0041283 0.0008756 -4.715 2.42e-06 ***
## logLOS 0.0264126 0.0112376 2.350 0.01875 *
## logRiskscore 0.6938423 0.0123924 55.989 < 2e-16 ***
## Under65 -0.0288134 0.0312722 -0.921 0.35686
## DRGMed.NoC -0.0369916 0.0201679 -1.834 0.06663 .
## DRGOtherMED 0.0773120 0.0277699 2.784 0.00537 **
## DRGOSUngroup 0.0130269 0.0322735 0.404 0.68648
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37785 on 50081 degrees of freedom
## Residual deviance: 33807 on 50073 degrees of freedom
## AIC: 33825
##
## Number of Fisher Scoring iterations: 5
Next to go is Med.NoC with the hightest p-value.
readmission2<-readmission
var <- "DRG"
var.levels <- levels(readmission2[,var])
readmission2[,var] <- mapvalues(readmission2[,var],var.levels,c("DRGbase","DRGbase","OtherMED","OSUngroup"))
#Relevel
table <- as.data.frame(table(readmission2[,var]))
max <- which.max(table[,2])
level.name <- as.character(table[max,1])
readmission2[,var] <- relevel(readmission2[,var], ref = level.name)
table(readmission2[,var])
##
## DRGbase OtherMED OSUngroup
## 57431 5357 3988
readmission <- readmission2
train <- readmission[partition,]
test <- readmission[-partition,]
glmprobit <- glm(Readmission.Status ~ Race + Age + logLOS + logRiskscore + Under65 + DRG, data=train, family = binomial(link="probit"))
summary(glmprobit)
##
## Call:
## glm(formula = Readmission.Status ~ Race + Age + logLOS + logRiskscore +
## Under65 + DRG, family = binomial(link = "probit"), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2609 -0.5737 -0.3964 -0.2451 3.9543
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.4522233 0.0718467 -20.213 < 2e-16 ***
## RaceBlack 0.0177452 0.0245727 0.722 0.47020
## Age -0.0041311 0.0008755 -4.718 2.38e-06 ***
## logLOS 0.0261069 0.0112357 2.324 0.02015 *
## logRiskscore 0.6935189 0.0123905 55.972 < 2e-16 ***
## Under65 -0.0286239 0.0312711 -0.915 0.36001
## DRGOtherMED 0.0852906 0.0274319 3.109 0.00188 **
## DRGOSUngroup 0.0209850 0.0319836 0.656 0.51175
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37785 on 50081 degrees of freedom
## Residual deviance: 33811 on 50074 degrees of freedom
## AIC: 33827
##
## Number of Fisher Scoring iterations: 5
Now remove Race.
readmission <- readmission2
train <- readmission[partition,]
test <- readmission[-partition,]
glmprobit <- glm(Readmission.Status ~ Age + logLOS + logRiskscore + Under65 + DRG, data=train, family = binomial(link="probit"))
summary(glmprobit)
##
## Call:
## glm(formula = Readmission.Status ~ Age + logLOS + logRiskscore +
## Under65 + DRG, family = binomial(link = "probit"), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2639 -0.5736 -0.3965 -0.2452 3.9508
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.4482959 0.0716476 -20.214 < 2e-16 ***
## Age -0.0041628 0.0008745 -4.760 1.94e-06 ***
## logLOS 0.0261197 0.0112356 2.325 0.02009 *
## logRiskscore 0.6936603 0.0123892 55.989 < 2e-16 ***
## Under65 -0.0273565 0.0312158 -0.876 0.38083
## DRGOtherMED 0.0854442 0.0274312 3.115 0.00184 **
## DRGOSUngroup 0.0207800 0.0319832 0.650 0.51588
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37785 on 50081 degrees of freedom
## Residual deviance: 33811 on 50075 degrees of freedom
## AIC: 33825
##
## Number of Fisher Scoring iterations: 5
Now remove Under65.
readmission <- readmission2
train <- readmission[partition,]
test <- readmission[-partition,]
glmprobit <- glm(Readmission.Status ~ Age + logLOS + logRiskscore + DRG, data=train, family = binomial(link="probit"))
summary(glmprobit)
##
## Call:
## glm(formula = Readmission.Status ~ Age + logLOS + logRiskscore +
## DRG, family = binomial(link = "probit"), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2664 -0.5740 -0.3966 -0.2453 3.9512
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.4957246 0.0470470 -31.792 < 2e-16 ***
## Age -0.0035792 0.0005678 -6.303 2.91e-10 ***
## logLOS 0.0262177 0.0112352 2.334 0.01962 *
## logRiskscore 0.6931107 0.0123754 56.007 < 2e-16 ***
## DRGOtherMED 0.0852171 0.0274282 3.107 0.00189 **
## DRGOSUngroup 0.0209568 0.0319819 0.655 0.51229
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37785 on 50081 degrees of freedom
## Residual deviance: 33812 on 50076 degrees of freedom
## AIC: 33824
##
## Number of Fisher Scoring iterations: 5
Everything is now significant at the 10% level and we stop. If we wanted a lower significance level for decision making, the process would continue.