This chunk reads in the data, relevels factors, and prints a summary.

# Loading data
readmission <- read.csv(file="readmission.csv")

vars <- colnames(readmission)[c(2,3,5,9)] #variables to relevel
for (i in vars){
  table <- as.data.frame(table(readmission[,i]))
  max <- which.max(table[,2])
  level.name <- as.character(table[max,1])
  readmission[,i] <- relevel(readmission[,i], ref = level.name)
}
summary(readmission) 
##  Readmission.Status Gender          Race             ER        
##  Min.   :0.0000     F:38011   White   :56124   Min.   :0.0000  
##  1st Qu.:0.0000     M:28771   Black   : 7099   1st Qu.:0.0000  
##  Median :0.0000               Hispanic: 1286   Median :0.0000  
##  Mean   :0.1259               Others  : 2273   Mean   :0.5083  
##  3rd Qu.:0.0000                                3rd Qu.:1.0000  
##  Max.   :1.0000                                Max.   :9.0000  
##    DRG.Class          LOS              Age         HCC.Riskscore   
##  MED    :35771   Min.   : 1.000   Min.   : 24.00   Min.   : 0.079  
##  SURG   :30447   1st Qu.: 3.000   1st Qu.: 67.00   1st Qu.: 1.107  
##  UNGROUP:  564   Median : 5.000   Median : 75.00   Median : 1.865  
##                  Mean   : 6.693   Mean   : 73.64   Mean   : 2.345  
##                  3rd Qu.: 8.000   3rd Qu.: 83.00   3rd Qu.: 3.173  
##                  Max.   :36.000   Max.   :101.00   Max.   :12.307  
##       DRG.Complication
##  MedicalMCC.CC:18110  
##  MedicalNoC   :12310  
##  Other        : 9345  
##  SurgMCC.CC   :15468  
##  SurgNoC      :11549  
## 

##Task 1 I have elected to make a table for ER and histograms for the other three variables.

library(ggplot2)
table(readmission$ER)
## 
##     0     1     2     3     4     5     6     7     9 
## 43086 16280  5286  1572   438   105    10     3     2
ggplot(readmission,aes(x=LOS))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(readmission,aes(x=Age))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(readmission,aes(x=HCC.Riskscore))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

I will use log transformations of the LOS and HCC.Riskscore variables. I also create an indicator variables for those under age 65. The following chunk does these.

#two log transforms and removal of original variables
readmission$logLOS <- log(readmission$LOS)
readmission$logRiskscore <- log(readmission$HCC.Riskscore)
readmission$LOS <- NULL
readmission$HCC.Riskscore <- NULL
readmission$Under65 <- ifelse(readmission$Age < 65, 1, 0)
summary(readmission)
##  Readmission.Status Gender          Race             ER        
##  Min.   :0.0000     F:38011   White   :56124   Min.   :0.0000  
##  1st Qu.:0.0000     M:28771   Black   : 7099   1st Qu.:0.0000  
##  Median :0.0000               Hispanic: 1286   Median :0.0000  
##  Mean   :0.1259               Others  : 2273   Mean   :0.5083  
##  3rd Qu.:0.0000                                3rd Qu.:1.0000  
##  Max.   :1.0000                                Max.   :9.0000  
##    DRG.Class          Age              DRG.Complication     logLOS     
##  MED    :35771   Min.   : 24.00   MedicalMCC.CC:18110   Min.   :0.000  
##  SURG   :30447   1st Qu.: 67.00   MedicalNoC   :12310   1st Qu.:1.099  
##  UNGROUP:  564   Median : 75.00   Other        : 9345   Median :1.609  
##                  Mean   : 73.64   SurgMCC.CC   :15468   Mean   :1.653  
##                  3rd Qu.: 83.00   SurgNoC      :11549   3rd Qu.:2.079  
##                  Max.   :101.00                         Max.   :3.584  
##   logRiskscore        Under65      
##  Min.   :-2.5383   Min.   :0.0000  
##  1st Qu.: 0.1017   1st Qu.:0.0000  
##  Median : 0.6235   Median :0.0000  
##  Mean   : 0.5999   Mean   :0.1684  
##  3rd Qu.: 1.1547   3rd Qu.:0.0000  
##  Max.   : 2.5102   Max.   :1.0000

##Task 2 This chunk creates a tabular view of the two variables.

table(readmission$DRG.Class,readmission$DRG.Complication)
##          
##           MedicalMCC.CC MedicalNoC Other SurgMCC.CC SurgNoC
##   MED             18104      12310  5357          0       0
##   SURG                6          0  3424      15468   11549
##   UNGROUP             0          0   564          0       0

Six items will be deleted and the two existing variables recoded as a single factor variable. The following code does that.

readmission.new <- readmission #preserve the original data until the work can be checked
readmission.new <- readmission.new[!(readmission.new$DRG.Complication=="MedicalMCC.CC" & readmission.new$DRG.Class=="SURG"),]
readmission.new$DRG <- ifelse(readmission.new$DRG.Complication=="MedicalMCC.CC","Med.C",
                       ifelse(readmission.new$DRG.Complication=="MedicalNoC","Med.NoC",
                              ifelse(readmission.new$DRG.Complication=="SurgMCC.CC","Surg.C",
                                     ifelse(readmission.new$DRG.Complication=="SurgNoC","Surg.NoC",
                                            ifelse(readmission.new$DRG.Class=="UNGROUP","UNGROUP",
                                                   ifelse(readmission.new$DRG.Complication=="Other"&readmission.new$DRG.Class=="MED","OtherMED","OtherSURG"))))))
readmission.new$DRG <- as.factor(readmission.new$DRG)
table(readmission.new$DRG)
## 
##     Med.C   Med.NoC  OtherMED OtherSURG    Surg.C  Surg.NoC   UNGROUP 
##     18104     12310      5357      3424     15468     11549       564
readmission.new$DRG.Class <- NULL
readmission.new$DRG.Complication <- NULL

Relevel the new variable.

table <- as.data.frame(table(readmission.new[,"DRG"]))
  max <- which.max(table[,2])
  level.name <- as.character(table[max,1])
  readmission.new[,"DRG"] <- relevel(readmission.new[,"DRG"], ref = level.name)
table(readmission.new$DRG)
## 
##     Med.C   Med.NoC  OtherMED OtherSURG    Surg.C  Surg.NoC   UNGROUP 
##     18104     12310      5357      3424     15468     11549       564

Accept the new dataframe by renaming it back to readmission.

readmission <- readmission.new
readmission.new <- NULL
summary(readmission)
##  Readmission.Status Gender          Race             ER        
##  Min.   :0.0000     F:38005   White   :56120   Min.   :0.0000  
##  1st Qu.:0.0000     M:28771   Black   : 7097   1st Qu.:0.0000  
##  Median :0.0000               Hispanic: 1286   Median :0.0000  
##  Mean   :0.1259               Others  : 2273   Mean   :0.5083  
##  3rd Qu.:0.0000                                3rd Qu.:1.0000  
##  Max.   :1.0000                                Max.   :9.0000  
##                                                                
##       Age             logLOS       logRiskscore        Under65      
##  Min.   : 24.00   Min.   :0.000   Min.   :-2.5383   Min.   :0.0000  
##  1st Qu.: 67.00   1st Qu.:1.099   1st Qu.: 0.1017   1st Qu.:0.0000  
##  Median : 75.00   Median :1.609   Median : 0.6238   Median :0.0000  
##  Mean   : 73.64   Mean   :1.653   Mean   : 0.6000   Mean   :0.1684  
##  3rd Qu.: 83.00   3rd Qu.:2.079   3rd Qu.: 1.1547   3rd Qu.:0.0000  
##  Max.   :101.00   Max.   :3.584   Max.   : 2.5102   Max.   :1.0000  
##                                                                     
##         DRG       
##  Med.C    :18104  
##  Med.NoC  :12310  
##  OtherMED : 5357  
##  OtherSURG: 3424  
##  Surg.C   :15468  
##  Surg.NoC :11549  
##  UNGROUP  :  564

##Task 3 This code performs cluster analysis for from 1 to 12 clusters and constructs an elbow plot.

nstart.val <- 20
cluster_vars <- readmission[c('logLOS','Age')]
for(i in 1:ncol(cluster_vars)){
  cluster_vars[,i] <- scale(cluster_vars[,i])
}
km1 <- kmeans(cluster_vars,centers=1,nstart=nstart.val)
km2 <- kmeans(cluster_vars,centers=2,nstart=nstart.val)
km3 <- kmeans(cluster_vars,centers=3,nstart=nstart.val)
km4 <- kmeans(cluster_vars,centers=4,nstart=nstart.val)
km5 <- kmeans(cluster_vars,centers=5,nstart=nstart.val)
km6 <- kmeans(cluster_vars,centers=6,nstart=nstart.val)
km7 <- kmeans(cluster_vars,centers=7,nstart=nstart.val)
km8 <- kmeans(cluster_vars,centers=8,nstart=nstart.val)
km9 <- kmeans(cluster_vars,centers=9,nstart=nstart.val)
km10 <- kmeans(cluster_vars,centers=10,nstart=nstart.val)
km11 <- kmeans(cluster_vars,centers=11,nstart=nstart.val)
km12 <- kmeans(cluster_vars,centers=12,nstart=nstart.val)

var.exp <- data.frame(k = c(1:12),
                      bss_tss = c(km1$betweenss/km1$totss,
                                  km2$betweenss/km2$totss,
                                  km3$betweenss/km3$totss,
                                  km4$betweenss/km4$totss,
                                  km5$betweenss/km5$totss,
                                  km6$betweenss/km6$totss,
                                  km7$betweenss/km7$totss,
                                  km8$betweenss/km8$totss,
                                  km9$betweenss/km9$totss,
                                  km10$betweenss/km10$totss,
                                  km11$betweenss/km11$totss,
                                  km12$betweenss/km12$totss))

ggplot(var.exp,aes(x=k,y=bss_tss))+geom_point()

The following chunk creates the new variable based on 5 clusters.

LOS_Age_Clust <- as.factor(km5$cluster) #This creates a new variable based on having 5 clusters.
cluster_vars$LOS_Age_Clust <- LOS_Age_Clust
ggplot(data = cluster_vars, aes(x = Age, y = logLOS, col = LOS_Age_Clust)) + geom_point() + theme(axis.text = element_blank(), legend.title = element_blank()) +ggtitle("Clustering with 5 groups")

##Task 4

My first look is at ER and logRiskscore.

ggplot(readmission,aes(x=factor(Readmission.Status),y=logRiskscore)) + geom_boxplot() +facet_wrap(~factor(ER))

Trying again with Race and Gender.

ggplot(readmission,aes(Gender,fill=factor(Readmission.Status))) + geom_bar(position = "fill") +
  facet_wrap(~Race,ncol=2,scales="free")+scale_y_continuous()+ylab("percent")

##Task 5

Before fitting a GLM, I will split the data into training and test sets.

#Create train and test sets
library(caret)
## Loading required package: lattice
set.seed(4321)
partition <- createDataPartition(readmission[,1], list = FALSE, p = .75) #The partition will stratify using variable 1 from the dataframe
train <- readmission[partition, ]
test <- readmission[-partition, ]

print("TRAIN")
## [1] "TRAIN"
mean(train$Readmission.Status)
## [1] 0.1252346
print("TEST")
## [1] "TEST"
mean(test$Readmission.Status)
## [1] 0.1280101

I will now run a glm using the binomial distribution and logit link and add the desired interaction.

library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
glmlogit <- glm(Readmission.Status ~ . + Gender*Race, data=train, family = binomial(link="logit"))

summary(glmlogit)
## 
## Call:
## glm(formula = Readmission.Status ~ . + Gender * Race, family = binomial(link = "logit"), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3401  -0.5625  -0.3926  -0.2590   3.5158  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -2.630481   0.137493 -19.132  < 2e-16 ***
## GenderM               0.002919   0.031255   0.093   0.9256    
## RaceBlack            -0.013201   0.060223  -0.219   0.8265    
## RaceHispanic          0.072077   0.132982   0.542   0.5878    
## RaceOthers           -0.104211   0.111986  -0.931   0.3521    
## ER                    0.004022   0.017275   0.233   0.8159    
## Age                  -0.006665   0.001624  -4.105 4.05e-05 ***
## logLOS                0.052063   0.020491   2.541   0.0111 *  
## logRiskscore          1.302494   0.023641  55.094  < 2e-16 ***
## Under65              -0.045725   0.057877  -0.790   0.4295    
## DRGMed.NoC           -0.069340   0.042810  -1.620   0.1053    
## DRGOtherMED           0.142270   0.054853   2.594   0.0095 ** 
## DRGOtherSURG          0.025573   0.067532   0.379   0.7049    
## DRGSurg.C            -0.032400   0.040172  -0.807   0.4199    
## DRGSurg.NoC           0.017780   0.043439   0.409   0.6823    
## DRGUNGROUP           -0.014643   0.142768  -0.103   0.9183    
## GenderM:RaceBlack     0.113370   0.090355   1.255   0.2096    
## GenderM:RaceHispanic  0.068279   0.205629   0.332   0.7399    
## GenderM:RaceOthers    0.154444   0.159098   0.971   0.3317    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37785  on 50081  degrees of freedom
## Residual deviance: 33818  on 50063  degrees of freedom
## AIC: 33856
## 
## Number of Fisher Scoring iterations: 5
predslogit <- predict(glmlogit,newdat=test,type="response")

roclogit <- roc(test$Readmission.Status,predslogit)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
confusionMatrix(factor(1*(predslogit>.5)),factor(test$Readmission.Status))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 14551  2132
##          1     6     5
##                                          
##                Accuracy : 0.8719         
##                  95% CI : (0.8668, 0.877)
##     No Information Rate : 0.872          
##     P-Value [Acc > NIR] : 0.515          
##                                          
##                   Kappa : 0.0033         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.99959        
##             Specificity : 0.00234        
##          Pos Pred Value : 0.87221        
##          Neg Pred Value : 0.45455        
##              Prevalence : 0.87199        
##          Detection Rate : 0.87163        
##    Detection Prevalence : 0.99934        
##       Balanced Accuracy : 0.50096        
##                                          
##        'Positive' Class : 0              
## 
plot(roclogit)

auc(roclogit)
## Area under the curve: 0.7443

The same code is now run with the probit link.

library(pROC)
glmprobit <- glm(Readmission.Status ~ . + Gender*Race, data=train, family = binomial(link="probit"))

summary(glmprobit)
## 
## Call:
## glm(formula = Readmission.Status ~ . + Gender * Race, family = binomial(link = "probit"), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2610  -0.5734  -0.3966  -0.2449   3.9581  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -1.4498310  0.0742169 -19.535  < 2e-16 ***
## GenderM               0.0017917  0.0168690   0.106   0.9154    
## RaceBlack            -0.0097589  0.0325639  -0.300   0.7644    
## RaceHispanic          0.0334025  0.0725431   0.460   0.6452    
## RaceOthers           -0.0546917  0.0596048  -0.918   0.3588    
## ER                    0.0035282  0.0093310   0.378   0.7053    
## Age                  -0.0040958  0.0008794  -4.658  3.2e-06 ***
## logLOS                0.0265354  0.0112425   2.360   0.0183 *  
## logRiskscore          0.6936871  0.0123968  55.957  < 2e-16 ***
## Under65              -0.0301577  0.0313121  -0.963   0.3355    
## DRGMed.NoC           -0.0378245  0.0230854  -1.638   0.1013    
## DRGOtherMED           0.0766117  0.0299537   2.558   0.0105 *  
## DRGOtherSURG          0.0164803  0.0367285   0.449   0.6536    
## DRGSurg.C            -0.0112814  0.0216294  -0.522   0.6020    
## DRGSurg.NoC           0.0119932  0.0234597   0.511   0.6092    
## DRGUNGROUP           -0.0121150  0.0786184  -0.154   0.8775    
## GenderM:RaceBlack     0.0675905  0.0492164   1.373   0.1696    
## GenderM:RaceHispanic  0.0547836  0.1119560   0.489   0.6246    
## GenderM:RaceOthers    0.0688073  0.0857200   0.803   0.4221    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37785  on 50081  degrees of freedom
## Residual deviance: 33802  on 50063  degrees of freedom
## AIC: 33840
## 
## Number of Fisher Scoring iterations: 5
predsprobit <- predict(glmprobit,newdat=test,type="response")

rocprobit <- roc(test$Readmission.Status,predsprobit)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
confusionMatrix(factor(1*(predsprobit>.5)),factor(test$Readmission.Status))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 14555  2136
##          1     2     1
##                                          
##                Accuracy : 0.8719         
##                  95% CI : (0.8668, 0.877)
##     No Information Rate : 0.872          
##     P-Value [Acc > NIR] : 0.515          
##                                          
##                   Kappa : 6e-04          
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.9998626      
##             Specificity : 0.0004679      
##          Pos Pred Value : 0.8720268      
##          Neg Pred Value : 0.3333333      
##              Prevalence : 0.8719899      
##          Detection Rate : 0.8718701      
##    Detection Prevalence : 0.9998203      
##       Balanced Accuracy : 0.5001653      
##                                          
##        'Positive' Class : 0              
## 
plot(rocprobit)

auc(rocprobit)
## Area under the curve: 0.7444

##Task 6

A new dataframe is created that drops logLOS and Age and adds LOS_Age_Clust. It must then be partitioned.

readmission.cluster <- readmission
readmission.cluster$Age <- NULL
readmission.cluster$logLOS <- NULL
readmission.cluster$LOS_Age_Clust <- LOS_Age_Clust
table(readmission.cluster$LOS_Age_Clust)
## 
##     1     2     3     4     5 
## 14274  7604 18346 10754 15798
train.cluster <- readmission.cluster[partition, ]
test.cluster <- readmission.cluster[-partition, ]

This code reruns the probit regression using the cluster variable.

library(pROC)
glmprobit.clust <- glm(Readmission.Status ~ . + Gender*Race, data=train.cluster, family = binomial(link="probit"))

summary(glmprobit.clust)
## 
## Call:
## glm(formula = Readmission.Status ~ . + Gender * Race, family = binomial(link = "probit"), 
##     data = train.cluster)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2236  -0.5731  -0.3978  -0.2453   4.0018  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -1.736969   0.025371 -68.462  < 2e-16 ***
## GenderM               0.006032   0.016858   0.358  0.72050    
## RaceBlack            -0.003182   0.032523  -0.098  0.92207    
## RaceHispanic          0.036379   0.072499   0.502  0.61582    
## RaceOthers           -0.048185   0.059557  -0.809  0.41848    
## ER                    0.003631   0.009328   0.389  0.69710    
## logRiskscore          0.688514   0.012289  56.026  < 2e-16 ***
## Under65               0.087803   0.032869   2.671  0.00755 ** 
## DRGMed.NoC           -0.037807   0.023080  -1.638  0.10140    
## DRGOtherMED           0.075491   0.029944   2.521  0.01170 *  
## DRGOtherSURG          0.017360   0.036717   0.473  0.63635    
## DRGSurg.C            -0.011607   0.021625  -0.537  0.59145    
## DRGSurg.NoC           0.011960   0.023454   0.510  0.61009    
## DRGUNGROUP           -0.018299   0.078695  -0.233  0.81612    
## LOS_Age_Clust2       -0.004442   0.041913  -0.106  0.91559    
## LOS_Age_Clust3        0.011884   0.023157   0.513  0.60781    
## LOS_Age_Clust4        0.070173   0.025051   2.801  0.00509 ** 
## LOS_Age_Clust5       -0.016639   0.023753  -0.700  0.48362    
## GenderM:RaceBlack     0.065427   0.049198   1.330  0.18356    
## GenderM:RaceHispanic  0.059008   0.111872   0.527  0.59787    
## GenderM:RaceOthers    0.065261   0.085671   0.762  0.44621    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37785  on 50081  degrees of freedom
## Residual deviance: 33814  on 50061  degrees of freedom
## AIC: 33856
## 
## Number of Fisher Scoring iterations: 5
predsprobit.clust <- predict(glmprobit.clust,newdat=test.cluster,type="response")

rocprobit.clust <- roc(test.cluster$Readmission.Status,predsprobit.clust)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
confusionMatrix(factor(1*(predsprobit.clust>.5)),factor(test.cluster$Readmission.Status))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 14554  2135
##          1     3     2
##                                          
##                Accuracy : 0.8719         
##                  95% CI : (0.8668, 0.877)
##     No Information Rate : 0.872          
##     P-Value [Acc > NIR] : 0.515          
##                                          
##                   Kappa : 0.0013         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.9997939      
##             Specificity : 0.0009359      
##          Pos Pred Value : 0.8720714      
##          Neg Pred Value : 0.4000000      
##              Prevalence : 0.8719899      
##          Detection Rate : 0.8718102      
##    Detection Prevalence : 0.9997005      
##       Balanced Accuracy : 0.5003649      
##                                          
##        'Positive' Class : 0              
## 
plot(rocprobit.clust)

auc(rocprobit.clust)
## Area under the curve: 0.7434

##Task 7

The first step is to binarize the factor levels that have more than two levels. fullRank is set to FALSE so that all levels get binarized. The interaction variable is also a factor variable and must be binarized as well. In each case the base level will then need to be deleted. I do it this way becasue if fullRank is TRUE I can’t be sure about which level becomes the base.

#Add the interaction variable to the data frame
readmission$RaceGender <- factor(paste0(readmission$Race,readmission$Gender))
summary(readmission$RaceGender)
##    BlackF    BlackM HispanicF HispanicM   OthersF   OthersM    WhiteF 
##      4157      2940       751       535      1206      1067     31891 
##    WhiteM 
##     24229
factor_names <- c("Race","DRG","RaceGender") 
factor_vars <- readmission[,factor_names]
for (var in factor_names) {
  factor_vars[, var] <- as.character(factor_vars[, var])
}


binarizer <- caret::dummyVars(paste("~", paste(factor_names, collapse = "+")) , data = factor_vars, fullRank = FALSE)
binarized_vars <- data.frame(predict(binarizer, newdata = factor_vars))
head(binarized_vars)
##   RaceBlack RaceHispanic RaceOthers RaceWhite DRGMed.C DRGMed.NoC
## 1         0            0          0         1        0          0
## 2         0            0          0         1        0          0
## 3         0            0          0         1        0          0
## 4         0            0          0         1        0          1
## 5         0            0          0         1        0          1
## 6         0            0          0         1        0          0
##   DRGOtherMED DRGOtherSURG DRGSurg.C DRGSurg.NoC DRGUNGROUP
## 1           1            0         0           0          0
## 2           0            0         0           1          0
## 3           0            0         0           1          0
## 4           0            0         0           0          0
## 5           0            0         0           0          0
## 6           0            0         1           0          0
##   RaceGenderBlackF RaceGenderBlackM RaceGenderHispanicF
## 1                0                0                   0
## 2                0                0                   0
## 3                0                0                   0
## 4                0                0                   0
## 5                0                0                   0
## 6                0                0                   0
##   RaceGenderHispanicM RaceGenderOthersF RaceGenderOthersM RaceGenderWhiteF
## 1                   0                 0                 0                0
## 2                   0                 0                 0                0
## 3                   0                 0                 0                0
## 4                   0                 0                 0                0
## 5                   0                 0                 0                0
## 6                   0                 0                 0                0
##   RaceGenderWhiteM
## 1                1
## 2                1
## 3                1
## 4                1
## 5                1
## 6                1

Now delete the three base variables.

binarized_vars$RaceWhite <- NULL
binarized_vars$DRGMed.C <- NULL
binarized_vars$RaceGenderWhiteF <- NULL
head(binarized_vars)
##   RaceBlack RaceHispanic RaceOthers DRGMed.NoC DRGOtherMED DRGOtherSURG
## 1         0            0          0          0           1            0
## 2         0            0          0          0           0            0
## 3         0            0          0          0           0            0
## 4         0            0          0          1           0            0
## 5         0            0          0          1           0            0
## 6         0            0          0          0           0            0
##   DRGSurg.C DRGSurg.NoC DRGUNGROUP RaceGenderBlackF RaceGenderBlackM
## 1         0           0          0                0                0
## 2         0           1          0                0                0
## 3         0           1          0                0                0
## 4         0           0          0                0                0
## 5         0           0          0                0                0
## 6         1           0          0                0                0
##   RaceGenderHispanicF RaceGenderHispanicM RaceGenderOthersF
## 1                   0                   0                 0
## 2                   0                   0                 0
## 3                   0                   0                 0
## 4                   0                   0                 0
## 5                   0                   0                 0
## 6                   0                   0                 0
##   RaceGenderOthersM RaceGenderWhiteM
## 1                 0                1
## 2                 0                1
## 3                 0                1
## 4                 0                1
## 5                 0                1
## 6                 0                1

I now attach the binarized variables and remove the three original factor variables. A new dataframe is created so the old one is preserved.

readmission.bin <- cbind(readmission,binarized_vars)
readmission.bin$DRG <- NULL
readmission.bin$Race <- NULL
readmission.bin$RaceGender <- NULL
summary(readmission.bin)
##  Readmission.Status Gender          ER              Age        
##  Min.   :0.0000     F:38005   Min.   :0.0000   Min.   : 24.00  
##  1st Qu.:0.0000     M:28771   1st Qu.:0.0000   1st Qu.: 67.00  
##  Median :0.0000               Median :0.0000   Median : 75.00  
##  Mean   :0.1259               Mean   :0.5083   Mean   : 73.64  
##  3rd Qu.:0.0000               3rd Qu.:1.0000   3rd Qu.: 83.00  
##  Max.   :1.0000               Max.   :9.0000   Max.   :101.00  
##      logLOS       logRiskscore        Under65         RaceBlack     
##  Min.   :0.000   Min.   :-2.5383   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:1.099   1st Qu.: 0.1017   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.609   Median : 0.6238   Median :0.0000   Median :0.0000  
##  Mean   :1.653   Mean   : 0.6000   Mean   :0.1684   Mean   :0.1063  
##  3rd Qu.:2.079   3rd Qu.: 1.1547   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :3.584   Max.   : 2.5102   Max.   :1.0000   Max.   :1.0000  
##   RaceHispanic       RaceOthers        DRGMed.NoC      DRGOtherMED     
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.0000   Median :0.00000  
##  Mean   :0.01926   Mean   :0.03404   Mean   :0.1843   Mean   :0.08022  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.0000   Max.   :1.00000  
##   DRGOtherSURG       DRGSurg.C       DRGSurg.NoC      DRGUNGROUP      
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.000   Min.   :0.000000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.000000  
##  Median :0.00000   Median :0.0000   Median :0.000   Median :0.000000  
##  Mean   :0.05128   Mean   :0.2316   Mean   :0.173   Mean   :0.008446  
##  3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.000   3rd Qu.:0.000000  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.000   Max.   :1.000000  
##  RaceGenderBlackF  RaceGenderBlackM  RaceGenderHispanicF
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000    
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000    
##  Median :0.00000   Median :0.00000   Median :0.00000    
##  Mean   :0.06225   Mean   :0.04403   Mean   :0.01125    
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000    
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.00000    
##  RaceGenderHispanicM RaceGenderOthersF RaceGenderOthersM RaceGenderWhiteM
##  Min.   :0.000000    Min.   :0.00000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.000000    1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.000000    Median :0.00000   Median :0.00000   Median :0.0000  
##  Mean   :0.008012    Mean   :0.01806   Mean   :0.01598   Mean   :0.3628  
##  3rd Qu.:0.000000    3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:1.0000  
##  Max.   :1.000000    Max.   :1.00000   Max.   :1.00000   Max.   :1.0000

I need to again split the data into train and test sets.

train.bin <- readmission.bin[partition, ]
test.bin <- readmission.bin[-partition, ]

I next run the GLM on the binarized data. In doing so, I recognized that the interaction variable created combinations that were redundant. I need to remove all the interactions with male and then re-partition the data.

readmission.bin$RaceGenderOthersM <- NULL
readmission.bin$RaceGenderWhiteM <- NULL
readmission.bin$RaceGenderHispanicM <- NULL
readmission.bin$RaceGenderBlackM <- NULL
summary(readmission.bin)
##  Readmission.Status Gender          ER              Age        
##  Min.   :0.0000     F:38005   Min.   :0.0000   Min.   : 24.00  
##  1st Qu.:0.0000     M:28771   1st Qu.:0.0000   1st Qu.: 67.00  
##  Median :0.0000               Median :0.0000   Median : 75.00  
##  Mean   :0.1259               Mean   :0.5083   Mean   : 73.64  
##  3rd Qu.:0.0000               3rd Qu.:1.0000   3rd Qu.: 83.00  
##  Max.   :1.0000               Max.   :9.0000   Max.   :101.00  
##      logLOS       logRiskscore        Under65         RaceBlack     
##  Min.   :0.000   Min.   :-2.5383   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:1.099   1st Qu.: 0.1017   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.609   Median : 0.6238   Median :0.0000   Median :0.0000  
##  Mean   :1.653   Mean   : 0.6000   Mean   :0.1684   Mean   :0.1063  
##  3rd Qu.:2.079   3rd Qu.: 1.1547   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :3.584   Max.   : 2.5102   Max.   :1.0000   Max.   :1.0000  
##   RaceHispanic       RaceOthers        DRGMed.NoC      DRGOtherMED     
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.0000   Median :0.00000  
##  Mean   :0.01926   Mean   :0.03404   Mean   :0.1843   Mean   :0.08022  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.0000   Max.   :1.00000  
##   DRGOtherSURG       DRGSurg.C       DRGSurg.NoC      DRGUNGROUP      
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.000   Min.   :0.000000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.000000  
##  Median :0.00000   Median :0.0000   Median :0.000   Median :0.000000  
##  Mean   :0.05128   Mean   :0.2316   Mean   :0.173   Mean   :0.008446  
##  3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.000   3rd Qu.:0.000000  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.000   Max.   :1.000000  
##  RaceGenderBlackF  RaceGenderHispanicF RaceGenderOthersF
##  Min.   :0.00000   Min.   :0.00000     Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000     1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000     Median :0.00000  
##  Mean   :0.06225   Mean   :0.01125     Mean   :0.01806  
##  3rd Qu.:0.00000   3rd Qu.:0.00000     3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000     Max.   :1.00000
train.bin <- readmission.bin[partition, ]
test.bin <- readmission.bin[-partition, ]
glmprobit <- glm(Readmission.Status ~ . , data=train.bin, family = binomial(link="probit"))

summary(glmprobit)
## 
## Call:
## glm(formula = Readmission.Status ~ ., family = binomial(link = "probit"), 
##     data = train.bin)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2610  -0.5734  -0.3966  -0.2449   3.9581  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -1.4498310  0.0742169 -19.535  < 2e-16 ***
## GenderM              0.0017917  0.0168690   0.106   0.9154    
## ER                   0.0035282  0.0093310   0.378   0.7053    
## Age                 -0.0040958  0.0008794  -4.658  3.2e-06 ***
## logLOS               0.0265354  0.0112425   2.360   0.0183 *  
## logRiskscore         0.6936871  0.0123968  55.957  < 2e-16 ***
## Under65             -0.0301577  0.0313121  -0.963   0.3355    
## RaceBlack            0.0578316  0.0373272   1.549   0.1213    
## RaceHispanic         0.0881861  0.0853691   1.033   0.3016    
## RaceOthers           0.0141156  0.0616693   0.229   0.8190    
## DRGMed.NoC          -0.0378245  0.0230854  -1.638   0.1013    
## DRGOtherMED          0.0766117  0.0299537   2.558   0.0105 *  
## DRGOtherSURG         0.0164803  0.0367285   0.449   0.6536    
## DRGSurg.C           -0.0112814  0.0216294  -0.522   0.6020    
## DRGSurg.NoC          0.0119932  0.0234597   0.511   0.6092    
## DRGUNGROUP          -0.0121150  0.0786184  -0.154   0.8775    
## RaceGenderBlackF    -0.0675905  0.0492164  -1.373   0.1696    
## RaceGenderHispanicF -0.0547836  0.1119560  -0.489   0.6246    
## RaceGenderOthersF   -0.0688073  0.0857200  -0.803   0.4221    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37785  on 50081  degrees of freedom
## Residual deviance: 33802  on 50063  degrees of freedom
## AIC: 33840
## 
## Number of Fisher Scoring iterations: 5
predsprobit <- predict(glmprobit,newdat=test.bin,type="response")

rocprobit <- roc(test.bin$Readmission.Status,predsprobit)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
confusionMatrix(factor(1*(predsprobit>.5)),factor(test.bin$Readmission.Status))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 14555  2136
##          1     2     1
##                                          
##                Accuracy : 0.8719         
##                  95% CI : (0.8668, 0.877)
##     No Information Rate : 0.872          
##     P-Value [Acc > NIR] : 0.515          
##                                          
##                   Kappa : 6e-04          
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.9998626      
##             Specificity : 0.0004679      
##          Pos Pred Value : 0.8720268      
##          Neg Pred Value : 0.3333333      
##              Prevalence : 0.8719899      
##          Detection Rate : 0.8718701      
##    Detection Prevalence : 0.9998203      
##       Balanced Accuracy : 0.5001653      
##                                          
##        'Positive' Class : 0              
## 
plot(rocprobit)

auc(rocprobit)
## Area under the curve: 0.7444

The next step is to run stepAIC on this model.

library(MASS)
stepAIC(glmprobit)
## Start:  AIC=33839.56
## Readmission.Status ~ Gender + ER + Age + logLOS + logRiskscore + 
##     Under65 + RaceBlack + RaceHispanic + RaceOthers + DRGMed.NoC + 
##     DRGOtherMED + DRGOtherSURG + DRGSurg.C + DRGSurg.NoC + DRGUNGROUP + 
##     RaceGenderBlackF + RaceGenderHispanicF + RaceGenderOthersF
## 
##                       Df Deviance   AIC
## - Gender               1    33802 33838
## - DRGUNGROUP           1    33802 33838
## - RaceOthers           1    33802 33838
## - ER                   1    33802 33838
## - DRGOtherSURG         1    33802 33838
## - RaceGenderHispanicF  1    33802 33838
## - DRGSurg.NoC          1    33802 33838
## - DRGSurg.C            1    33802 33838
## - RaceGenderOthersF    1    33802 33838
## - Under65              1    33802 33838
## - RaceHispanic         1    33803 33839
## - RaceGenderBlackF     1    33803 33839
## <none>                      33802 33840
## - RaceBlack            1    33804 33840
## - DRGMed.NoC           1    33804 33840
## - logLOS               1    33807 33843
## - DRGOtherMED          1    33808 33844
## - Age                  1    33823 33859
## - logRiskscore         1    37448 37484
## 
## Step:  AIC=33837.58
## Readmission.Status ~ ER + Age + logLOS + logRiskscore + Under65 + 
##     RaceBlack + RaceHispanic + RaceOthers + DRGMed.NoC + DRGOtherMED + 
##     DRGOtherSURG + DRGSurg.C + DRGSurg.NoC + DRGUNGROUP + RaceGenderBlackF + 
##     RaceGenderHispanicF + RaceGenderOthersF
## 
##                       Df Deviance   AIC
## - DRGUNGROUP           1    33802 33836
## - RaceOthers           1    33802 33836
## - ER                   1    33802 33836
## - DRGOtherSURG         1    33802 33836
## - DRGSurg.NoC          1    33802 33836
## - RaceGenderHispanicF  1    33802 33836
## - DRGSurg.C            1    33802 33836
## - RaceGenderOthersF    1    33802 33836
## - Under65              1    33803 33837
## - RaceHispanic         1    33803 33837
## <none>                      33802 33838
## - RaceGenderBlackF     1    33804 33838
## - RaceBlack            1    33804 33838
## - DRGMed.NoC           1    33804 33838
## - logLOS               1    33807 33841
## - DRGOtherMED          1    33808 33842
## - Age                  1    33823 33857
## - logRiskscore         1    37448 37482
## 
## Step:  AIC=33835.6
## Readmission.Status ~ ER + Age + logLOS + logRiskscore + Under65 + 
##     RaceBlack + RaceHispanic + RaceOthers + DRGMed.NoC + DRGOtherMED + 
##     DRGOtherSURG + DRGSurg.C + DRGSurg.NoC + RaceGenderBlackF + 
##     RaceGenderHispanicF + RaceGenderOthersF
## 
##                       Df Deviance   AIC
## - RaceOthers           1    33802 33834
## - ER                   1    33802 33834
## - DRGOtherSURG         1    33802 33834
## - DRGSurg.C            1    33802 33834
## - RaceGenderHispanicF  1    33802 33834
## - DRGSurg.NoC          1    33802 33834
## - RaceGenderOthersF    1    33802 33834
## - Under65              1    33803 33835
## - RaceHispanic         1    33803 33835
## <none>                      33802 33836
## - RaceGenderBlackF     1    33804 33836
## - RaceBlack            1    33804 33836
## - DRGMed.NoC           1    33804 33836
## - logLOS               1    33807 33839
## - DRGOtherMED          1    33808 33840
## - Age                  1    33824 33856
## - logRiskscore         1    37449 37481
## 
## Step:  AIC=33833.66
## Readmission.Status ~ ER + Age + logLOS + logRiskscore + Under65 + 
##     RaceBlack + RaceHispanic + DRGMed.NoC + DRGOtherMED + DRGOtherSURG + 
##     DRGSurg.C + DRGSurg.NoC + RaceGenderBlackF + RaceGenderHispanicF + 
##     RaceGenderOthersF
## 
##                       Df Deviance   AIC
## - ER                   1    33802 33832
## - DRGOtherSURG         1    33802 33832
## - DRGSurg.C            1    33802 33832
## - RaceGenderHispanicF  1    33802 33832
## - DRGSurg.NoC          1    33802 33832
## - RaceGenderOthersF    1    33803 33833
## - Under65              1    33803 33833
## - RaceHispanic         1    33803 33833
## <none>                      33802 33834
## - RaceGenderBlackF     1    33804 33834
## - RaceBlack            1    33804 33834
## - DRGMed.NoC           1    33804 33834
## - logLOS               1    33807 33837
## - DRGOtherMED          1    33808 33838
## - Age                  1    33824 33854
## - logRiskscore         1    37449 37479
## 
## Step:  AIC=33831.8
## Readmission.Status ~ Age + logLOS + logRiskscore + Under65 + 
##     RaceBlack + RaceHispanic + DRGMed.NoC + DRGOtherMED + DRGOtherSURG + 
##     DRGSurg.C + DRGSurg.NoC + RaceGenderBlackF + RaceGenderHispanicF + 
##     RaceGenderOthersF
## 
##                       Df Deviance   AIC
## - DRGOtherSURG         1    33802 33830
## - DRGSurg.C            1    33802 33830
## - RaceGenderHispanicF  1    33802 33830
## - DRGSurg.NoC          1    33802 33830
## - RaceGenderOthersF    1    33803 33831
## - Under65              1    33803 33831
## - RaceHispanic         1    33803 33831
## <none>                      33802 33832
## - RaceGenderBlackF     1    33804 33832
## - RaceBlack            1    33804 33832
## - DRGMed.NoC           1    33804 33832
## - logLOS               1    33807 33835
## - DRGOtherMED          1    33808 33836
## - Age                  1    33824 33852
## - logRiskscore         1    37450 37478
## 
## Step:  AIC=33830.01
## Readmission.Status ~ Age + logLOS + logRiskscore + Under65 + 
##     RaceBlack + RaceHispanic + DRGMed.NoC + DRGOtherMED + DRGSurg.C + 
##     DRGSurg.NoC + RaceGenderBlackF + RaceGenderHispanicF + RaceGenderOthersF
## 
##                       Df Deviance   AIC
## - DRGSurg.NoC          1    33802 33828
## - RaceGenderHispanicF  1    33802 33828
## - DRGSurg.C            1    33802 33828
## - RaceGenderOthersF    1    33803 33829
## - Under65              1    33803 33829
## - RaceHispanic         1    33803 33829
## <none>                      33802 33830
## - RaceGenderBlackF     1    33804 33830
## - RaceBlack            1    33805 33831
## - DRGMed.NoC           1    33805 33831
## - logLOS               1    33808 33834
## - DRGOtherMED          1    33808 33834
## - Age                  1    33824 33850
## - logRiskscore         1    37450 37476
## 
## Step:  AIC=33828.2
## Readmission.Status ~ Age + logLOS + logRiskscore + Under65 + 
##     RaceBlack + RaceHispanic + DRGMed.NoC + DRGOtherMED + DRGSurg.C + 
##     RaceGenderBlackF + RaceGenderHispanicF + RaceGenderOthersF
## 
##                       Df Deviance   AIC
## - RaceGenderHispanicF  1    33802 33826
## - DRGSurg.C            1    33803 33827
## - RaceGenderOthersF    1    33803 33827
## - Under65              1    33803 33827
## - RaceHispanic         1    33803 33827
## <none>                      33802 33828
## - RaceGenderBlackF     1    33804 33828
## - RaceBlack            1    33805 33829
## - DRGMed.NoC           1    33807 33831
## - logLOS               1    33808 33832
## - DRGOtherMED          1    33808 33832
## - Age                  1    33824 33848
## - logRiskscore         1    37450 37474
## 
## Step:  AIC=33826.46
## Readmission.Status ~ Age + logLOS + logRiskscore + Under65 + 
##     RaceBlack + RaceHispanic + DRGMed.NoC + DRGOtherMED + DRGSurg.C + 
##     RaceGenderBlackF + RaceGenderOthersF
## 
##                     Df Deviance   AIC
## - DRGSurg.C          1    33803 33825
## - RaceGenderOthersF  1    33803 33825
## - Under65            1    33803 33825
## - RaceHispanic       1    33803 33825
## <none>                    33802 33826
## - RaceGenderBlackF   1    33805 33827
## - RaceBlack          1    33805 33827
## - DRGMed.NoC         1    33807 33829
## - logLOS             1    33808 33830
## - DRGOtherMED        1    33809 33831
## - Age                1    33825 33847
## - logRiskscore       1    37450 37472
## 
## Step:  AIC=33825.22
## Readmission.Status ~ Age + logLOS + logRiskscore + Under65 + 
##     RaceBlack + RaceHispanic + DRGMed.NoC + DRGOtherMED + RaceGenderBlackF + 
##     RaceGenderOthersF
## 
##                     Df Deviance   AIC
## - RaceGenderOthersF  1    33804 33824
## - Under65            1    33804 33824
## - RaceHispanic       1    33804 33824
## <none>                    33803 33825
## - RaceGenderBlackF   1    33805 33825
## - RaceBlack          1    33806 33826
## - DRGMed.NoC         1    33807 33827
## - logLOS             1    33809 33829
## - DRGOtherMED        1    33811 33831
## - Age                1    33825 33845
## - logRiskscore       1    37453 37473
## 
## Step:  AIC=33824.13
## Readmission.Status ~ Age + logLOS + logRiskscore + Under65 + 
##     RaceBlack + RaceHispanic + DRGMed.NoC + DRGOtherMED + RaceGenderBlackF
## 
##                    Df Deviance   AIC
## - Under65           1    33805 33823
## - RaceHispanic      1    33805 33823
## <none>                   33804 33824
## - RaceGenderBlackF  1    33806 33824
## - RaceBlack         1    33807 33825
## - DRGMed.NoC        1    33808 33826
## - logLOS            1    33810 33828
## - DRGOtherMED       1    33812 33830
## - Age               1    33826 33844
## - logRiskscore      1    37454 37472
## 
## Step:  AIC=33823.07
## Readmission.Status ~ Age + logLOS + logRiskscore + RaceBlack + 
##     RaceHispanic + DRGMed.NoC + DRGOtherMED + RaceGenderBlackF
## 
##                    Df Deviance   AIC
## - RaceHispanic      1    33806 33822
## <none>                   33805 33823
## - RaceGenderBlackF  1    33807 33823
## - RaceBlack         1    33808 33824
## - DRGMed.NoC        1    33809 33825
## - logLOS            1    33811 33827
## - DRGOtherMED       1    33813 33829
## - Age               1    33841 33857
## - logRiskscore      1    37461 37477
## 
## Step:  AIC=33822.06
## Readmission.Status ~ Age + logLOS + logRiskscore + RaceBlack + 
##     DRGMed.NoC + DRGOtherMED + RaceGenderBlackF
## 
##                    Df Deviance   AIC
## <none>                   33806 33822
## - RaceGenderBlackF  1    33808 33822
## - RaceBlack         1    33808 33822
## - DRGMed.NoC        1    33810 33824
## - logLOS            1    33812 33826
## - DRGOtherMED       1    33814 33828
## - Age               1    33843 33857
## - logRiskscore      1    37462 37476
## 
## Call:  glm(formula = Readmission.Status ~ Age + logLOS + logRiskscore + 
##     RaceBlack + DRGMed.NoC + DRGOtherMED + RaceGenderBlackF, 
##     family = binomial(link = "probit"), data = train.bin)
## 
## Coefficients:
##      (Intercept)               Age            logLOS      logRiskscore  
##        -1.496292         -0.003491          0.026865          0.693134  
##        RaceBlack        DRGMed.NoC       DRGOtherMED  RaceGenderBlackF  
##         0.056380         -0.038102          0.076249         -0.068861  
## 
## Degrees of Freedom: 50081 Total (i.e. Null);  50074 Residual
## Null Deviance:       37780 
## Residual Deviance: 33810     AIC: 33820

The probit model is now run using the five surviving variables.

glmprobit <- glm(Readmission.Status ~ logLOS + Age + logRiskscore + DRGOtherSURG + DRGOtherMED, data=train.bin, family = binomial(link="probit"))

summary(glmprobit)
## 
## Call:
## glm(formula = Readmission.Status ~ logLOS + Age + logRiskscore + 
##     DRGOtherSURG + DRGOtherMED, family = binomial(link = "probit"), 
##     data = train.bin)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2665  -0.5741  -0.3967  -0.2453   3.9514  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.4957724  0.0470444 -31.795  < 2e-16 ***
## logLOS        0.0262253  0.0112344   2.334  0.01958 *  
## Age          -0.0035797  0.0005678  -6.304 2.89e-10 ***
## logRiskscore  0.6931615  0.0123750  56.013  < 2e-16 ***
## DRGOtherSURG  0.0254804  0.0346815   0.735  0.46252    
## DRGOtherMED   0.0852455  0.0274126   3.110  0.00187 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37785  on 50081  degrees of freedom
## Residual deviance: 33812  on 50076  degrees of freedom
## AIC: 33824
## 
## Number of Fisher Scoring iterations: 5
predsprobit <- predict(glmprobit,newdat=test.bin,type="response")

rocprobit <- roc(test.bin$Readmission.Status,predsprobit)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
confusionMatrix(factor(1*(predsprobit>.5)),factor(test.bin$Readmission.Status))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 14554  2134
##          1     3     3
##                                          
##                Accuracy : 0.872          
##                  95% CI : (0.8668, 0.877)
##     No Information Rate : 0.872          
##     P-Value [Acc > NIR] : 0.5058         
##                                          
##                   Kappa : 0.0021         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.999794       
##             Specificity : 0.001404       
##          Pos Pred Value : 0.872124       
##          Neg Pred Value : 0.500000       
##              Prevalence : 0.871990       
##          Detection Rate : 0.871810       
##    Detection Prevalence : 0.999641       
##       Balanced Accuracy : 0.500599       
##                                          
##        'Positive' Class : 0              
## 
plot(rocprobit)

auc(rocprobit)
## Area under the curve: 0.7451

##Task 8

I first run the model on the full dataset.

glmprobit <- glm(Readmission.Status ~ logLOS + Age + logRiskscore + DRGOtherSURG + DRGOtherMED, data=readmission.bin, family = binomial(link="probit"))

glmprobit2 <- glm(Readmission.Status ~ logLOS + Age + logRiskscore + DRGMed.NoC + DRGOtherMED, data=readmission.bin, family = binomial(link="probit"))

summary(glmprobit)
## 
## Call:
## glm(formula = Readmission.Status ~ logLOS + Age + logRiskscore + 
##     DRGOtherSURG + DRGOtherMED, family = binomial(link = "probit"), 
##     data = readmission.bin)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2680  -0.5747  -0.3952  -0.2421   3.9745  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.5026235  0.0407530 -36.871  < 2e-16 ***
## logLOS        0.0329948  0.0097365   3.389 0.000702 ***
## Age          -0.0036999  0.0004916  -7.526 5.25e-14 ***
## logRiskscore  0.7008229  0.0107283  65.325  < 2e-16 ***
## DRGOtherSURG  0.0560715  0.0293707   1.909 0.056250 .  
## DRGOtherMED   0.0691546  0.0237996   2.906 0.003664 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 50559  on 66775  degrees of freedom
## Residual deviance: 45116  on 66770  degrees of freedom
## AIC: 45128
## 
## Number of Fisher Scoring iterations: 5
summary(glmprobit2)
## 
## Call:
## glm(formula = Readmission.Status ~ logLOS + Age + logRiskscore + 
##     DRGMed.NoC + DRGOtherMED, family = binomial(link = "probit"), 
##     data = readmission.bin)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2683  -0.5749  -0.3953  -0.2418   3.9690  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.4961959  0.0408457 -36.630  < 2e-16 ***
## logLOS        0.0336166  0.0097341   3.454 0.000553 ***
## Age          -0.0037051  0.0004916  -7.536 4.84e-14 ***
## logRiskscore  0.7010516  0.0107302  65.334  < 2e-16 ***
## DRGMed.NoC   -0.0202730  0.0172126  -1.178 0.238878    
## DRGOtherMED   0.0617021  0.0239867   2.572 0.010101 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 50559  on 66775  degrees of freedom
## Residual deviance: 45118  on 66770  degrees of freedom
## AIC: 45130
## 
## Number of Fisher Scoring iterations: 5

I now create some arbitrary patients to learn how changing the values affects the predicted probability of readmission.

new.data <- data.frame("logLOS" = c(log(5),log(6),log(5),log(5),log(5),log(5)), "Age" = c(75,75,80,75,75,75), "logRiskscore" = c(log(1.866),log(1.866),log(1.866),log(2.053),log(1.866),log(1.866)), "DRGOtherSURG" = c(0,0,0,0,1,0), "DRGOtherMED" = c(0,0,0,0,0,1))
new.data
##     logLOS Age logRiskscore DRGOtherSURG DRGOtherMED
## 1 1.609438  75    0.6237971            0           0
## 2 1.791759  75    0.6237971            0           0
## 3 1.609438  80    0.6237971            0           0
## 4 1.609438  75    0.7193021            0           0
## 5 1.609438  75    0.6237971            1           0
## 6 1.609438  75    0.6237971            0           1
predict(glmprobit, newdat = new.data, type = "response")
##          1          2          3          4          5          6 
## 0.09855331 0.09960192 0.09537932 0.11068247 0.10864481 0.11110281

##Task 9

I rerun the model on the full dataset.

glmprobit <- glm(Readmission.Status ~ logLOS + Age + logRiskscore + DRGOtherSURG + DRGOtherMED, data=readmission.bin, family = binomial(link="probit"))

summary(glmprobit)
## 
## Call:
## glm(formula = Readmission.Status ~ logLOS + Age + logRiskscore + 
##     DRGOtherSURG + DRGOtherMED, family = binomial(link = "probit"), 
##     data = readmission.bin)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2680  -0.5747  -0.3952  -0.2421   3.9745  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.5026235  0.0407530 -36.871  < 2e-16 ***
## logLOS        0.0329948  0.0097365   3.389 0.000702 ***
## Age          -0.0036999  0.0004916  -7.526 5.25e-14 ***
## logRiskscore  0.7008229  0.0107283  65.325  < 2e-16 ***
## DRGOtherSURG  0.0560715  0.0293707   1.909 0.056250 .  
## DRGOtherMED   0.0691546  0.0237996   2.906 0.003664 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 50559  on 66775  degrees of freedom
## Residual deviance: 45116  on 66770  degrees of freedom
## AIC: 45128
## 
## Number of Fisher Scoring iterations: 5
predsprobit <- predict(glmprobit,newdat=readmission.bin,type="response")


confusionMatrix(factor(1*(predsprobit>.5)),factor(readmission.bin$Readmission.Status))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 58356  8399
##          1    11    10
##                                           
##                Accuracy : 0.8741          
##                  95% CI : (0.8715, 0.8766)
##     No Information Rate : 0.8741          
##     P-Value [Acc > NIR] : 0.5076          
##                                           
##                   Kappa : 0.0017          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.999812        
##             Specificity : 0.001189        
##          Pos Pred Value : 0.874182        
##          Neg Pred Value : 0.476190        
##              Prevalence : 0.874072        
##          Detection Rate : 0.873907        
##    Detection Prevalence : 0.999686        
##       Balanced Accuracy : 0.500500        
##                                           
##        'Positive' Class : 0               
## 

This code calculates the cost for different cutoff values. I ran it at different values and am presting the final choice of 0.08.

cutoff <- 0.08
pred_readmit <- 1*(predsprobit > cutoff)
cm <- confusionMatrix(factor(pred_readmit),factor(readmission.bin$Readmission.Status))

no_intervention_cost <- 25*sum(readmission.bin$Readmission.Status == 1)
full_intervention_cost <- 2*nrow(readmission.bin)
modified_cost <- cm$table[2,1]*2+cm$table[2,2]*2+cm$table[1,2]*25
no_intervention_cost
## [1] 210225
full_intervention_cost
## [1] 133552
modified_cost
## [1] 106002

The final step is to get the confusion matrix.

cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 26379  1096
##          1 31988  7313
##                                           
##                Accuracy : 0.5046          
##                  95% CI : (0.5008, 0.5084)
##     No Information Rate : 0.8741          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.125           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.4520          
##             Specificity : 0.8697          
##          Pos Pred Value : 0.9601          
##          Neg Pred Value : 0.1861          
##              Prevalence : 0.8741          
##          Detection Rate : 0.3950          
##    Detection Prevalence : 0.4115          
##       Balanced Accuracy : 0.6608          
##                                           
##        'Positive' Class : 0               
## 

##ALTERNATIVE FEATURE SELECTION

This section presents an alternative approach. It is based on using hypothesis tests to sequentially remove features. It begins by re-running the probit model on the training set using all the available features.

glmprobit <- glm(Readmission.Status ~ . + Race*Gender, data=train, family = binomial(link="probit"))

summary(glmprobit)
## 
## Call:
## glm(formula = Readmission.Status ~ . + Race * Gender, family = binomial(link = "probit"), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2610  -0.5734  -0.3966  -0.2449   3.9581  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -1.4498310  0.0742169 -19.535  < 2e-16 ***
## GenderM               0.0017917  0.0168690   0.106   0.9154    
## RaceBlack            -0.0097589  0.0325639  -0.300   0.7644    
## RaceHispanic          0.0334025  0.0725431   0.460   0.6452    
## RaceOthers           -0.0546917  0.0596048  -0.918   0.3588    
## ER                    0.0035282  0.0093310   0.378   0.7053    
## Age                  -0.0040958  0.0008794  -4.658  3.2e-06 ***
## logLOS                0.0265354  0.0112425   2.360   0.0183 *  
## logRiskscore          0.6936871  0.0123968  55.957  < 2e-16 ***
## Under65              -0.0301577  0.0313121  -0.963   0.3355    
## DRGMed.NoC           -0.0378245  0.0230854  -1.638   0.1013    
## DRGOtherMED           0.0766117  0.0299537   2.558   0.0105 *  
## DRGOtherSURG          0.0164803  0.0367285   0.449   0.6536    
## DRGSurg.C            -0.0112814  0.0216294  -0.522   0.6020    
## DRGSurg.NoC           0.0119932  0.0234597   0.511   0.6092    
## DRGUNGROUP           -0.0121150  0.0786184  -0.154   0.8775    
## GenderM:RaceBlack     0.0675905  0.0492164   1.373   0.1696    
## GenderM:RaceHispanic  0.0547836  0.1119560   0.489   0.6246    
## GenderM:RaceOthers    0.0688073  0.0857200   0.803   0.4221    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37785  on 50081  degrees of freedom
## Residual deviance: 33802  on 50063  degrees of freedom
## AIC: 33840
## 
## Number of Fisher Scoring iterations: 5

It is difficult to deal with a categorical interaction variable if the goal is to remove levels. With the interaction, there are actually 8 levels in play and they would need to be created as a new variable in order to merge some of them. Given that none of the three interaction terms appear to add value, I’ll save time and go ahead and remove them.

glmprobit <- glm(Readmission.Status ~ ., data=train, family = binomial(link="probit")) 

summary(glmprobit)
## 
## Call:
## glm(formula = Readmission.Status ~ ., family = binomial(link = "probit"), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2655  -0.5734  -0.3965  -0.2451   3.9526  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.4561795  0.0741299 -19.644  < 2e-16 ***
## GenderM       0.0126145  0.0154395   0.817   0.4139    
## RaceBlack     0.0192307  0.0246860   0.779   0.4360    
## RaceHispanic  0.0564365  0.0553110   1.020   0.3076    
## RaceOthers   -0.0222827  0.0428616  -0.520   0.6031    
## ER            0.0035240  0.0093298   0.378   0.7056    
## Age          -0.0040723  0.0008793  -4.631 3.63e-06 ***
## logLOS        0.0263916  0.0112413   2.348   0.0189 *  
## logRiskscore  0.6937428  0.0123958  55.966  < 2e-16 ***
## Under65      -0.0288886  0.0313046  -0.923   0.3561    
## DRGMed.NoC   -0.0379484  0.0230833  -1.644   0.1002    
## DRGOtherMED   0.0764142  0.0299528   2.551   0.0107 *  
## DRGOtherSURG  0.0172804  0.0367224   0.471   0.6379    
## DRGSurg.C    -0.0114789  0.0216281  -0.531   0.5956    
## DRGSurg.NoC   0.0116945  0.0234562   0.499   0.6181    
## DRGUNGROUP   -0.0122043  0.0786236  -0.155   0.8766    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37785  on 50081  degrees of freedom
## Residual deviance: 33804  on 50066  degrees of freedom
## AIC: 33836
## 
## Number of Fisher Scoring iterations: 5

Normally the next step would be combine Race-Others with Race-White (the base) but given the similarity to Race-Hispanic, both will be combined with Race-White. This is mostly done to save time.

readmission$RaceGender <- NULL #Need to remove this variable that was created earlier.

readmission2<-readmission 

library(plyr)
var <- "Race"
var.levels <- levels(readmission2[,var])
readmission2[,var] <- mapvalues(readmission2[,var],var.levels,c("NonBlack","Black","NonBlack","NonBlack"))
#Relevel
table <- as.data.frame(table(readmission2[,var]))
  max <- which.max(table[,2])
  level.name <- as.character(table[max,1])
  readmission2[,var] <- relevel(readmission2[,var], ref = level.name)

table(readmission2[,var])
## 
## NonBlack    Black 
##    59679     7097

Running the model again, remembering to create new train and test sets. The same partition continues to be used to keep results consistent.

readmission <- readmission2
train <- readmission[partition,]
test <- readmission[-partition,]
glmprobit <- glm(Readmission.Status ~ ., data=train, family = binomial(link="probit")) 

summary(glmprobit)
## 
## Call:
## glm(formula = Readmission.Status ~ ., family = binomial(link = "probit"), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2661  -0.5734  -0.3966  -0.2448   3.9520  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.4563982  0.0740194 -19.676  < 2e-16 ***
## GenderM       0.0123990  0.0154377   0.803   0.4219    
## RaceBlack     0.0187312  0.0245815   0.762   0.4461    
## ER            0.0035314  0.0093294   0.379   0.7050    
## Age          -0.0040643  0.0008788  -4.625 3.75e-06 ***
## logLOS        0.0264274  0.0112407   2.351   0.0187 *  
## logRiskscore  0.6937402  0.0123961  55.965  < 2e-16 ***
## Under65      -0.0280045  0.0312893  -0.895   0.3708    
## DRGMed.NoC   -0.0380316  0.0230823  -1.648   0.0994 .  
## DRGOtherMED   0.0764746  0.0299516   2.553   0.0107 *  
## DRGOtherSURG  0.0167651  0.0367184   0.457   0.6480    
## DRGSurg.C    -0.0115843  0.0216275  -0.536   0.5922    
## DRGSurg.NoC   0.0114905  0.0234549   0.490   0.6242    
## DRGUNGROUP   -0.0121335  0.0786215  -0.154   0.8774    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37785  on 50081  degrees of freedom
## Residual deviance: 33805  on 50068  degrees of freedom
## AIC: 33833
## 
## Number of Fisher Scoring iterations: 5

For similar reasons, we simultaneously merge the two insignificant DRG levels into the base.

readmission2<-readmission 

var <- "DRG"
var.levels <- levels(readmission2[,var])
readmission2[,var] <- mapvalues(readmission2[,var],var.levels,c("DRGbase","Med.NoC","OtherMED","OtherSURG","DRGbase","DRGbase","UNGROUP"))
#Relevel
table <- as.data.frame(table(readmission2[,var]))
  max <- which.max(table[,2])
  level.name <- as.character(table[max,1])
  readmission2[,var] <- relevel(readmission2[,var], ref = level.name)

table(readmission2[,var])
## 
##   DRGbase   Med.NoC  OtherMED OtherSURG   UNGROUP 
##     45121     12310      5357      3424       564
readmission <- readmission2
train <- readmission[partition,]
test <- readmission[-partition,]
glmprobit <- glm(Readmission.Status ~ ., data=train, family = binomial(link="probit")) 

summary(glmprobit)
## 
## Call:
## glm(formula = Readmission.Status ~ ., family = binomial(link = "probit"), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2662  -0.5731  -0.3965  -0.2448   3.9422  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.4570340  0.0731439 -19.920  < 2e-16 ***
## GenderM       0.0122968  0.0154369   0.797  0.42569    
## RaceBlack     0.0186226  0.0245817   0.758  0.44870    
## ER            0.0035828  0.0093291   0.384  0.70095    
## Age          -0.0040696  0.0008788  -4.631 3.64e-06 ***
## logLOS        0.0264496  0.0112407   2.353  0.01862 *  
## logRiskscore  0.6937479  0.0123953  55.968  < 2e-16 ***
## Under65      -0.0280057  0.0312892  -0.895  0.37076    
## DRGMed.NoC   -0.0370186  0.0201685  -1.835  0.06644 .  
## DRGOtherMED   0.0774743  0.0277714   2.790  0.00528 ** 
## DRGOtherSURG  0.0177706  0.0349618   0.508  0.61125    
## DRGUNGROUP   -0.0111276  0.0778191  -0.143  0.88630    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37785  on 50081  degrees of freedom
## Residual deviance: 33806  on 50070  degrees of freedom
## AIC: 33830
## 
## Number of Fisher Scoring iterations: 5

Now remove ER.

glmprobit <- glm(Readmission.Status ~ Gender + Race + Age + logLOS + logRiskscore + Under65 + DRG, data=train, family = binomial(link="probit")) 

summary(glmprobit)
## 
## Call:
## glm(formula = Readmission.Status ~ Gender + Race + Age + logLOS + 
##     logRiskscore + Under65 + DRG, family = binomial(link = "probit"), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2650  -0.5732  -0.3967  -0.2450   3.9438  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.4551391  0.0729745 -19.940  < 2e-16 ***
## GenderM       0.0122698  0.0154363   0.795  0.42669    
## RaceBlack     0.0185940  0.0245811   0.756  0.44939    
## Age          -0.0040694  0.0008788  -4.631 3.65e-06 ***
## logLOS        0.0263961  0.0112398   2.348  0.01885 *  
## logRiskscore  0.6937771  0.0123950  55.973  < 2e-16 ***
## Under65      -0.0279778  0.0312889  -0.894  0.37123    
## DRGMed.NoC   -0.0370394  0.0201684  -1.837  0.06628 .  
## DRGOtherMED   0.0775242  0.0277709   2.792  0.00525 ** 
## DRGOtherSURG  0.0177182  0.0349616   0.507  0.61230    
## DRGUNGROUP   -0.0111023  0.0778154  -0.143  0.88655    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37785  on 50081  degrees of freedom
## Residual deviance: 33807  on 50071  degrees of freedom
## AIC: 33829
## 
## Number of Fisher Scoring iterations: 5

Now remove Gender

glmprobit <- glm(Readmission.Status ~  Race + Age + logLOS + logRiskscore + Under65 + DRG, data=train, family = binomial(link="probit")) 

summary(glmprobit)
## 
## Call:
## glm(formula = Readmission.Status ~ Race + Age + logLOS + logRiskscore + 
##     Under65 + DRG, family = binomial(link = "probit"), data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2610  -0.5734  -0.3968  -0.2450   3.9488  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.4454061  0.0719427 -20.091  < 2e-16 ***
## RaceBlack     0.0182451  0.0245749   0.742  0.45783    
## Age          -0.0041287  0.0008756  -4.715 2.41e-06 ***
## logLOS        0.0264271  0.0112398   2.351  0.01871 *  
## logRiskscore  0.6939005  0.0123935  55.989  < 2e-16 ***
## Under65      -0.0288084  0.0312723  -0.921  0.35694    
## DRGMed.NoC   -0.0369944  0.0201681  -1.834  0.06661 .  
## DRGOtherMED   0.0773078  0.0277702   2.784  0.00537 ** 
## DRGOtherSURG  0.0175849  0.0349617   0.503  0.61498    
## DRGUNGROUP   -0.0109392  0.0778137  -0.141  0.88820    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37785  on 50081  degrees of freedom
## Residual deviance: 33807  on 50072  degrees of freedom
## AIC: 33827
## 
## Number of Fisher Scoring iterations: 5

Rather than remove DRGUNGROUP at this stage, it will be combined with DRGOtherSURG. The difference in significance is likely due to sample sizes.

readmission2<-readmission 

var <- "DRG"
var.levels <- levels(readmission2[,var])
readmission2[,var] <- mapvalues(readmission2[,var],var.levels,c("DRGbase","Med.NoC","OtherMED","OSUngroup","OSUngroup"))
#Relevel
table <- as.data.frame(table(readmission2[,var]))
  max <- which.max(table[,2])
  level.name <- as.character(table[max,1])
  readmission2[,var] <- relevel(readmission2[,var], ref = level.name)

table(readmission2[,var])
## 
##   DRGbase   Med.NoC  OtherMED OSUngroup 
##     45121     12310      5357      3988
readmission <- readmission2
train <- readmission[partition,]
test <- readmission[-partition,]
glmprobit <- glm(Readmission.Status ~ Race + Age + logLOS + logRiskscore + Under65 + DRG, data=train, family = binomial(link="probit")) 

summary(glmprobit)
## 
## Call:
## glm(formula = Readmission.Status ~ Race + Age + logLOS + logRiskscore + 
##     Under65 + DRG, family = binomial(link = "probit"), data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2610  -0.5734  -0.3967  -0.2450   3.9487  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.4453515  0.0719416 -20.091  < 2e-16 ***
## RaceBlack     0.0181893  0.0245749   0.740  0.45920    
## Age          -0.0041283  0.0008756  -4.715 2.42e-06 ***
## logLOS        0.0264126  0.0112376   2.350  0.01875 *  
## logRiskscore  0.6938423  0.0123924  55.989  < 2e-16 ***
## Under65      -0.0288134  0.0312722  -0.921  0.35686    
## DRGMed.NoC   -0.0369916  0.0201679  -1.834  0.06663 .  
## DRGOtherMED   0.0773120  0.0277699   2.784  0.00537 ** 
## DRGOSUngroup  0.0130269  0.0322735   0.404  0.68648    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37785  on 50081  degrees of freedom
## Residual deviance: 33807  on 50073  degrees of freedom
## AIC: 33825
## 
## Number of Fisher Scoring iterations: 5

Next to go is Med.NoC with the hightest p-value.

readmission2<-readmission 

var <- "DRG"
var.levels <- levels(readmission2[,var])
readmission2[,var] <- mapvalues(readmission2[,var],var.levels,c("DRGbase","DRGbase","OtherMED","OSUngroup"))
#Relevel
table <- as.data.frame(table(readmission2[,var]))
  max <- which.max(table[,2])
  level.name <- as.character(table[max,1])
  readmission2[,var] <- relevel(readmission2[,var], ref = level.name)

table(readmission2[,var])
## 
##   DRGbase  OtherMED OSUngroup 
##     57431      5357      3988
readmission <- readmission2
train <- readmission[partition,]
test <- readmission[-partition,]
glmprobit <- glm(Readmission.Status ~ Race + Age + logLOS + logRiskscore + Under65 + DRG, data=train, family = binomial(link="probit")) 

summary(glmprobit)
## 
## Call:
## glm(formula = Readmission.Status ~ Race + Age + logLOS + logRiskscore + 
##     Under65 + DRG, family = binomial(link = "probit"), data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2609  -0.5737  -0.3964  -0.2451   3.9543  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.4522233  0.0718467 -20.213  < 2e-16 ***
## RaceBlack     0.0177452  0.0245727   0.722  0.47020    
## Age          -0.0041311  0.0008755  -4.718 2.38e-06 ***
## logLOS        0.0261069  0.0112357   2.324  0.02015 *  
## logRiskscore  0.6935189  0.0123905  55.972  < 2e-16 ***
## Under65      -0.0286239  0.0312711  -0.915  0.36001    
## DRGOtherMED   0.0852906  0.0274319   3.109  0.00188 ** 
## DRGOSUngroup  0.0209850  0.0319836   0.656  0.51175    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37785  on 50081  degrees of freedom
## Residual deviance: 33811  on 50074  degrees of freedom
## AIC: 33827
## 
## Number of Fisher Scoring iterations: 5

Now remove Race.

readmission <- readmission2
train <- readmission[partition,]
test <- readmission[-partition,]
glmprobit <- glm(Readmission.Status ~ Age + logLOS + logRiskscore + Under65 + DRG, data=train, family = binomial(link="probit")) 

summary(glmprobit)
## 
## Call:
## glm(formula = Readmission.Status ~ Age + logLOS + logRiskscore + 
##     Under65 + DRG, family = binomial(link = "probit"), data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2639  -0.5736  -0.3965  -0.2452   3.9508  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.4482959  0.0716476 -20.214  < 2e-16 ***
## Age          -0.0041628  0.0008745  -4.760 1.94e-06 ***
## logLOS        0.0261197  0.0112356   2.325  0.02009 *  
## logRiskscore  0.6936603  0.0123892  55.989  < 2e-16 ***
## Under65      -0.0273565  0.0312158  -0.876  0.38083    
## DRGOtherMED   0.0854442  0.0274312   3.115  0.00184 ** 
## DRGOSUngroup  0.0207800  0.0319832   0.650  0.51588    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37785  on 50081  degrees of freedom
## Residual deviance: 33811  on 50075  degrees of freedom
## AIC: 33825
## 
## Number of Fisher Scoring iterations: 5

Now remove Under65.

readmission <- readmission2
train <- readmission[partition,]
test <- readmission[-partition,]
glmprobit <- glm(Readmission.Status ~ Age + logLOS + logRiskscore + DRG, data=train, family = binomial(link="probit")) 

summary(glmprobit)
## 
## Call:
## glm(formula = Readmission.Status ~ Age + logLOS + logRiskscore + 
##     DRG, family = binomial(link = "probit"), data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2664  -0.5740  -0.3966  -0.2453   3.9512  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.4957246  0.0470470 -31.792  < 2e-16 ***
## Age          -0.0035792  0.0005678  -6.303 2.91e-10 ***
## logLOS        0.0262177  0.0112352   2.334  0.01962 *  
## logRiskscore  0.6931107  0.0123754  56.007  < 2e-16 ***
## DRGOtherMED   0.0852171  0.0274282   3.107  0.00189 ** 
## DRGOSUngroup  0.0209568  0.0319819   0.655  0.51229    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37785  on 50081  degrees of freedom
## Residual deviance: 33812  on 50076  degrees of freedom
## AIC: 33824
## 
## Number of Fisher Scoring iterations: 5

Everything is now significant at the 10% level and we stop. If we wanted a lower significance level for decision making, the process would continue.