The first dataset is available from CA Hospital Mortality Rates for 2012-2013.

The second dataset is available from Hospital Profitability for 2009-2013.

Question: Can we predict hospital ratings from risk-adjusted mortality rates, number of deaths, number of cases, medical procedures performed, medical conditions treated, and hospital profitability for 2012-2013?

Dataset Wrangling.
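
The code in this analysis relies on a handful of packages: dplyr supplies tbl_df(), mutate(), and glimpse(); caTools the stratified train/test split; rpart the decision trees; rattle the fancyRpartPlot() tree plots; and randomForest the random forests. Load them once up front:

library(dplyr)         # tbl_df, mutate, glimpse
library(caTools)       # sample.split
library(rpart)         # rpart, rpart.control, printcp, plotcp, prune
library(rattle)        # fancyRpartPlot
library(randomForest)  # randomForest, importance, varImpPlot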

data <- read.csv("California_Hospital_Inpatient_Mortality_Rates_and_Quality_Ratings__2012-2013.csv", sep = ",", header = TRUE)
df <- tbl_df(data)
# Keep only rows with a recorded number of cases
df_clean <- df[!is.na(df$X..of.Cases), ]
# Collapse the Acute Stroke subtypes into a single factor level
df_clean$Procedure.Condition <- gsub("Acute Stroke .*", "Acute Stroke", df_clean$Procedure.Condition)
df_clean$Procedure.Condition <- factor(df_clean$Procedure.Condition)
# Label each row as a surgical Procedure or a medical Condition
df_clean <- df_clean %>%
  mutate(Medical_Category = ifelse(grepl("Repair", Procedure.Condition) |
                                     grepl("Endarterectomy", Procedure.Condition) |
                                     grepl("Craniotomy", Procedure.Condition) |
                                     grepl("Resection", Procedure.Condition) |
                                     grepl("PCI", Procedure.Condition),
                                   "Procedure", "Condition"))
glimpse(df_clean)
## Observations: 6,165
## Variables: 13
## $ Year                         <int> 2012, 2012, 2012, 2012, 2012, 201...
## $ County                       <fctr> Alameda, Alameda, Alameda, Alame...
## $ Hospital                     <fctr> Alameda Hospital, Alameda Hospit...
## $ OSHPDID                      <int> 106010735, 106010735, 106010735, ...
## $ Procedure.Condition          <fctr> AMI, Acute Stroke, GI Hemorrhage...
## $ Risk.Adjusted.Mortality.Rate <dbl> 5.5, 2.3, 1.3, 0.0, 25.1, 2.6, 8....
## $ X..of.Deaths                 <int> 4, 1, 1, 0, 5, 5, 7, 0, 5, 18, 8,...
## $ X..of.Cases                  <int> 41, 60, 65, 5, 11, 139, 72, 35, 1...
## $ Hospital.Ratings             <fctr> As Expected, As Expected, As Exp...
## $ Longitude                    <dbl> -122.2536, -122.2536, -122.2536, ...
## $ Latitude                     <dbl> 37.76295, 37.76295, 37.76295, 37....
## $ location1                    <fctr> (37.762953, -122.25362), (37.762...
## $ Medical_Category             <chr> "Condition", "Condition", "Condit...

Summary from EDA.
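
One point worth noting before modeling: the outcome is heavily imbalanced, with the large majority of rows rated "As Expected" (the decision-tree root node error below is only about 6%). A quick sketch of the class distribution (output omitted here):

# Counts and proportions of the outcome classes in the cleaned data
table(df_clean$Hospital.Ratings)
prop.table(table(df_clean$Hospital.Ratings))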

Split the Dataset into train and test sets.

set.seed(101) 
# Stratified 70/30 split on the outcome (caTools::sample.split)
sample = sample.split(df_clean$Hospital.Ratings, SplitRatio = .7)
train = subset(df_clean, sample == TRUE)
test_original = subset(df_clean, sample == FALSE)
# Drop the outcome column from the copy used for prediction
test <- subset(test_original, select = -Hospital.Ratings)
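
Since sample.split stratifies on Hospital.Ratings, the train and test sets should preserve the class proportions. A minimal check (output not shown):

# Confirm the stratified split preserved the outcome proportions
round(rbind(train = prop.table(table(train$Hospital.Ratings)),
            test  = prop.table(table(test_original$Hospital.Ratings))), 3)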

Decision Trees: Prediction of Hospital Ratings.

# Build the decision tree
tree0 <- rpart(Hospital.Ratings ~ Procedure.Condition + Risk.Adjusted.Mortality.Rate + X..of.Cases + X..of.Deaths + Year, data = train, method = "class")
printcp(tree0)
## 
## Classification tree:
## rpart(formula = Hospital.Ratings ~ Procedure.Condition + Risk.Adjusted.Mortality.Rate + 
##     X..of.Cases + X..of.Deaths + Year, data = train, method = "class")
## 
## Variables actually used in tree construction:
## [1] Procedure.Condition          Risk.Adjusted.Mortality.Rate
## [3] X..of.Cases                  X..of.Deaths                
## 
## Root node error: 258/4316 = 0.059778
## 
## n= 4316 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.020672      0   1.00000 1.00000 0.060368
## 2 0.017442      7   0.82946 0.94574 0.058808
## 3 0.013566      9   0.79457 0.91085 0.057777
## 4 0.010336     11   0.76744 0.89535 0.057311
## 5 0.010000     22   0.64341 0.87597 0.056723
fancyRpartPlot(tree0)

set.seed(20)
# Grow a deeper tree. cp is the complexity parameter: a split must improve the
# overall fit by at least cp to be kept. minsplit is the minimum number of
# observations a node must contain before a split is attempted.
tree1 <- rpart(Hospital.Ratings ~ Procedure.Condition + Risk.Adjusted.Mortality.Rate + X..of.Cases + X..of.Deaths + Year, data = train, method = "class", control = rpart.control(cp = 0.0001, minsplit = 50))

printcp(tree1)
## 
## Classification tree:
## rpart(formula = Hospital.Ratings ~ Procedure.Condition + Risk.Adjusted.Mortality.Rate + 
##     X..of.Cases + X..of.Deaths + Year, data = train, method = "class", 
##     control = rpart.control(cp = 1e-04, minsplit = 50))
## 
## Variables actually used in tree construction:
## [1] Procedure.Condition          Risk.Adjusted.Mortality.Rate
## [3] X..of.Cases                  X..of.Deaths                
## 
## Root node error: 258/4316 = 0.059778
## 
## n= 4316 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.020672      0   1.00000 1.00000 0.060368
## 2 0.017442      6   0.85659 0.99612 0.060258
## 3 0.000100      8   0.82171 0.95736 0.059147
plotcp(tree1)

# Find the lowest cross-validated error (xerror) across the cp values
min(tree1$cptable[,"xerror"])
## [1] 0.9573643
num <- which.min(tree1$cptable[,"xerror"])
num
## 3 
## 3
tree1$cptable[num,]
##         CP     nsplit  rel error     xerror       xstd 
## 0.00010000 8.00000000 0.82170543 0.95736434 0.05914689
cp.choice <- tree1$cptable[num, "CP"]
cp.choice
## [1] 1e-04
# cp.choice equals the cp used to grow tree1, so pruning at this value removes
# no splits; the "pruned" tree is identical to tree1.
pruned.tree <- prune(tree1, cp = cp.choice)
fancyRpartPlot(pruned.tree)

Making Predictions.

# Make predictions on the test set
prediction <- predict(pruned.tree, test, type = "class")
# confusion matrix
cm <- as.matrix(table(Actual = test_original$Hospital.Ratings, Predicted = prediction))
cm
##              Predicted
## Actual        As Expected Better Worse
##   As Expected        1725      3    11
##   Better               39      7     1
##   Worse                51      0    12

Accuracy. A key metric to start with is overall classification accuracy: the fraction of instances that are correctly classified, i.e. the sum of the confusion matrix diagonal divided by the total count. For the matrix above, that is (1725 + 7 + 12) / 1849 ≈ 0.943.

n = sum(cm) # number of instances
nc = nrow(cm) # number of classes
diag = diag(cm) # number of correctly classified instances per class 
rowsums = apply(cm, 1, sum) # number of instances per class
colsums = apply(cm, 2, sum) # number of predictions per class
p = rowsums / n # distribution of instances over the actual classes
q = colsums / n # distribution of instances over the predicted classes

accuracy = sum(diag) / n 
accuracy
## [1] 0.9432125

Per-class Precision, Recall, and F1. For each class, precision is the fraction of predictions of that class that are correct, recall is the fraction of actual instances of that class that are recovered, and F1 is the harmonic mean of the two.

precision = diag / colsums 
recall = diag / rowsums 
f1 = 2 * precision * recall / (precision + recall) 
data.frame(precision, recall, f1) 
##             precision    recall        f1
## As Expected 0.9504132 0.9919494 0.9707372
## Better      0.7000000 0.1489362 0.2456140
## Worse       0.5000000 0.1904762 0.2758621
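
Overall accuracy is dominated by the majority "As Expected" class; recall for "Better" and "Worse" is low. Macro-averaged metrics, which weight each class equally, make this visible. A minimal sketch reusing the vectors computed above:

# Macro-averaged metrics treat all three classes equally
data.frame(macro_precision = mean(precision),
           macro_recall    = mean(recall),
           macro_f1        = mean(f1))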

Random Forests: Prediction of Hospital Ratings.

# Grow a forest of 1,000 trees on the same predictors
fit <- randomForest(Hospital.Ratings ~ Procedure.Condition + Risk.Adjusted.Mortality.Rate + X..of.Cases + X..of.Deaths + Year, data = train, importance = TRUE, ntree = 1000)
print(fit) # view results 
## 
## Call:
##  randomForest(formula = Hospital.Ratings ~ Procedure.Condition +      Risk.Adjusted.Mortality.Rate + X..of.Cases + X..of.Deaths +      Year, data = train, importance = TRUE, ntree = 1000) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 4.26%
## Confusion matrix:
##             As Expected Better Worse class.error
## As Expected        4041     12     5 0.004189256
## Better               72     37     2 0.666666667
## Worse                91      2    54 0.632653061
importance(fit) # importance of each predictor
##                              As Expected    Better     Worse
## Procedure.Condition           170.206825 33.025166 18.880733
## Risk.Adjusted.Mortality.Rate  109.477324 36.812502 87.272882
## X..of.Cases                    81.576463 54.956765 22.974266
## X..of.Deaths                   48.746630  3.586386 70.795704
## Year                           -1.519326 -4.437809  1.813967
##                              MeanDecreaseAccuracy MeanDecreaseGini
## Procedure.Condition                    175.808902         88.36502
## Risk.Adjusted.Mortality.Rate           118.991829        164.79196
## X..of.Cases                             88.331588        120.66040
## X..of.Deaths                            53.960739         84.61701
## Year                                    -2.206323         13.08606
varImpPlot(fit)
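
Year shows negative MeanDecreaseAccuracy in the importance table, suggesting it mostly adds noise. A hedged sketch of a refit without it, to check whether the OOB error holds up (the seed is an assumption, since the original fit did not set one; results not shown):

set.seed(20)  # assumed seed for reproducibility; not part of the original fit
fit_noyear <- randomForest(Hospital.Ratings ~ Procedure.Condition +
                             Risk.Adjusted.Mortality.Rate + X..of.Cases + X..of.Deaths,
                           data = train, importance = TRUE, ntree = 1000)
print(fit_noyear)  # compare the OOB error against the fit that includes Year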

# Predict on the held-out test set and build the confusion matrix
prediction1 <- predict(fit, test)
cm <- as.matrix(table(Actual = test_original$Hospital.Ratings, Predicted = prediction1))
cm
##              Predicted
## Actual        As Expected Better Worse
##   As Expected        1732      2     5
##   Better               33     13     1
##   Worse                38      0    25
n = sum(cm) # number of instances
nc = nrow(cm) # number of classes
diag = diag(cm) # number of correctly classified instances per class 
rowsums = apply(cm, 1, sum) # number of instances per class
colsums = apply(cm, 2, sum) # number of predictions per class
p = rowsums / n # distribution of instances over the actual classes
q = colsums / n # distribution of instances over the predicted classes

accuracy = sum(diag) / n 
accuracy
## [1] 0.9572742
precision = diag / colsums 
recall = diag / rowsums 
f1 = 2 * precision * recall / (precision + recall) 
data.frame(precision, recall, f1) 
##             precision    recall        f1
## As Expected 0.9606212 0.9959747 0.9779785
## Better      0.8666667 0.2765957 0.4193548
## Worse       0.8064516 0.3968254 0.5319149
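
Collecting the headline test-set metrics reported above into one table shows the random forest ahead on every measure, especially on the minority classes:

# Test-set metrics copied from the outputs above
data.frame(model     = c("pruned rpart", "random forest"),
           accuracy  = c(0.9432, 0.9573),
           f1_better = c(0.2456, 0.4194),
           f1_worse  = c(0.2759, 0.5319))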