First Dataset is available from CA Hospital Mortality Rates for 2012-2013..
Second Dataset is availabe from Hospital Profitability for 2009-2013.
Question: Can we predict hospital ratings based on risk adjusted mortality rates, number of deaths, number of cases, medical procedures performed, medical conditions treated and hospital profitability for 2012-2013?
Dataset Wrangling.
data <- read.csv("California_Hospital_Inpatient_Mortality_Rates_and_Quality_Ratings__2012-2013.csv",sep=",",header=TRUE)
df <- tbl_df(data)
df_clean <- df[which(is.na(df$X..of.Cases)==F),]
df_clean$Procedure.Condition <- gsub("Acute Stroke .*","Acute Stroke",df_clean$Procedure.Condition)
df_clean$Procedure.Condition <- factor(df_clean$Procedure.Condition)
df_clean <- df_clean %>%
mutate(Medical_Category = ifelse(grepl("Repair",Procedure.Condition) | grepl("Endarterectomy",Procedure.Condition) | grepl("Craniotomy",Procedure.Condition) | grepl("Resection",Procedure.Condition) | grepl("PCI",Procedure.Condition), "Procedure", "Condition"))
glimpse(df_clean)
## Observations: 6,165
## Variables: 13
## $ Year <int> 2012, 2012, 2012, 2012, 2012, 201...
## $ County <fctr> Alameda, Alameda, Alameda, Alame...
## $ Hospital <fctr> Alameda Hospital, Alameda Hospit...
## $ OSHPDID <int> 106010735, 106010735, 106010735, ...
## $ Procedure.Condition <fctr> AMI, Acute Stroke, GI Hemorrhage...
## $ Risk.Adjusted.Mortality.Rate <dbl> 5.5, 2.3, 1.3, 0.0, 25.1, 2.6, 8....
## $ X..of.Deaths <int> 4, 1, 1, 0, 5, 5, 7, 0, 5, 18, 8,...
## $ X..of.Cases <int> 41, 60, 65, 5, 11, 139, 72, 35, 1...
## $ Hospital.Ratings <fctr> As Expected, As Expected, As Exp...
## $ Longitude <dbl> -122.2536, -122.2536, -122.2536, ...
## $ Latitude <dbl> 37.76295, 37.76295, 37.76295, 37....
## $ location1 <fctr> (37.762953, -122.25362), (37.762...
## $ Medical_Category <chr> "Condition", "Condition", "Condit...
Summary from EDA.
Higher the risk adjusted mortality rate, worse the hospital ratings.
The Risk.Adjusted.Mortality.Rate is lower for PCI procedure and for heart failure, AMI and GI Hemorrhage conditions.
Hospital ratings are worse for PCI procedure and for AMI, GI Hemorrhage and heart failure conditions.
Split the Dataset into train and test sets.
set.seed(101)
sample = sample.split(df_clean$Hospital.Ratings, SplitRatio = .7)
train = subset(df_clean, sample == TRUE)
test_original = subset(df_clean, sample == FALSE)
test <- subset(test_original, select = -Hospital.Ratings)
Reading sources:
# Build the decision tree
tree0 <- rpart(Hospital.Ratings ~ Procedure.Condition + Risk.Adjusted.Mortality.Rate + X..of.Cases + X..of.Deaths + Year, data = train, method = "class")
printcp(tree0)
##
## Classification tree:
## rpart(formula = Hospital.Ratings ~ Procedure.Condition + Risk.Adjusted.Mortality.Rate +
## X..of.Cases + X..of.Deaths + Year, data = train, method = "class")
##
## Variables actually used in tree construction:
## [1] Procedure.Condition Risk.Adjusted.Mortality.Rate
## [3] X..of.Cases X..of.Deaths
##
## Root node error: 258/4316 = 0.059778
##
## n= 4316
##
## CP nsplit rel error xerror xstd
## 1 0.020672 0 1.00000 1.00000 0.060368
## 2 0.017442 7 0.82946 0.94574 0.058808
## 3 0.013566 9 0.79457 0.91085 0.057777
## 4 0.010336 11 0.76744 0.89535 0.057311
## 5 0.010000 22 0.64341 0.87597 0.056723
fancyRpartPlot(tree0)
set.seed(20)
tree1 <- rpart(Hospital.Ratings ~ Procedure.Condition + Risk.Adjusted.Mortality.Rate + X..of.Cases + X..of.Deaths + Year, data = train, method = "class", control=rpart.control(cp=0.0001,minsplit = 50))
# cp determines when the splitting up of the decision tree stops.
# minsplit determines the minimum amount of observations in a leaf of the tree.
printcp(tree1)
##
## Classification tree:
## rpart(formula = Hospital.Ratings ~ Procedure.Condition + Risk.Adjusted.Mortality.Rate +
## X..of.Cases + X..of.Deaths + Year, data = train, method = "class",
## control = rpart.control(cp = 1e-04, minsplit = 50))
##
## Variables actually used in tree construction:
## [1] Procedure.Condition Risk.Adjusted.Mortality.Rate
## [3] X..of.Cases X..of.Deaths
##
## Root node error: 258/4316 = 0.059778
##
## n= 4316
##
## CP nsplit rel error xerror xstd
## 1 0.020672 0 1.00000 1.00000 0.060368
## 2 0.017442 6 0.85659 0.99612 0.060258
## 3 0.000100 8 0.82171 0.95736 0.059147
plotcp(tree1)
min(tree1$cptable[,"xerror"])
## [1] 0.9573643
num <- which.min(tree1$cptable[,"xerror"])
num
## 3
## 3
tree1$cptable[num,]
## CP nsplit rel error xerror xstd
## 0.00010000 8.00000000 0.82170543 0.95736434 0.05914689
cp.choice<-tree1$cptable[num,"CP"]
cp.choice
## [1] 1e-04
pruned.tree<-prune(tree1, cp=cp.choice)
#pruned.tree
fancyRpartPlot(pruned.tree)
Making Predictions.
# Make predictions on the test set
prediction <- predict(pruned.tree, test, type = "class")
# confusion matrix
cm <- as.matrix(table(Actual = test_original$Hospital.Ratings,Predicted = prediction))
cm
## Predicted
## Actual As Expected Better Worse
## As Expected 1725 3 11
## Better 39 7 1
## Worse 51 0 12
Accuracy. A key metric to start with is the overall classification accuracy. It is defined as the fraction of instances that are correctly classified.
n = sum(cm) # number of instances
nc = nrow(cm) # number of classes
diag = diag(cm) # number of correctly classified instances per class
rowsums = apply(cm, 1, sum) # number of instances per class
colsums = apply(cm, 2, sum) # number of predictions per class
p = rowsums / n # distribution of instances over the actual classes
q = colsums / n # distribution of instances over the predicted classes
accuracy = sum(diag) / n
accuracy
## [1] 0.9432125
Per-class Precision, Recall, and F-1.
precision = diag / colsums
recall = diag / rowsums
f1 = 2 * precision * recall / (precision + recall)
data.frame(precision, recall, f1)
## precision recall f1
## As Expected 0.9504132 0.9919494 0.9707372
## Better 0.7000000 0.1489362 0.2456140
## Worse 0.5000000 0.1904762 0.2758621
fit <- randomForest(Hospital.Ratings ~ Procedure.Condition + Risk.Adjusted.Mortality.Rate + X..of.Cases + X..of.Deaths + Year, data=train,importance=TRUE,ntree=1000)
print(fit) # view results
##
## Call:
## randomForest(formula = Hospital.Ratings ~ Procedure.Condition + Risk.Adjusted.Mortality.Rate + X..of.Cases + X..of.Deaths + Year, data = train, importance = TRUE, ntree = 1000)
## Type of random forest: classification
## Number of trees: 1000
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 4.26%
## Confusion matrix:
## As Expected Better Worse class.error
## As Expected 4041 12 5 0.004189256
## Better 72 37 2 0.666666667
## Worse 91 2 54 0.632653061
importance(fit) # importance of each predictor
## As Expected Better Worse
## Procedure.Condition 170.206825 33.025166 18.880733
## Risk.Adjusted.Mortality.Rate 109.477324 36.812502 87.272882
## X..of.Cases 81.576463 54.956765 22.974266
## X..of.Deaths 48.746630 3.586386 70.795704
## Year -1.519326 -4.437809 1.813967
## MeanDecreaseAccuracy MeanDecreaseGini
## Procedure.Condition 175.808902 88.36502
## Risk.Adjusted.Mortality.Rate 118.991829 164.79196
## X..of.Cases 88.331588 120.66040
## X..of.Deaths 53.960739 84.61701
## Year -2.206323 13.08606
varImpPlot(fit)
prediction1 <- predict(fit, test)
cm <- as.matrix(table(Actual = test_original$Hospital.Ratings,Predicted = prediction1))
cm
## Predicted
## Actual As Expected Better Worse
## As Expected 1732 2 5
## Better 33 13 1
## Worse 38 0 25
n = sum(cm) # number of instances
nc = nrow(cm) # number of classes
diag = diag(cm) # number of correctly classified instances per class
rowsums = apply(cm, 1, sum) # number of instances per class
colsums = apply(cm, 2, sum) # number of predictions per class
p = rowsums / n # distribution of instances over the actual classes
q = colsums / n # distribution of instances over the predicted classes
accuracy = sum(diag) / n
accuracy
## [1] 0.9572742
precision = diag / colsums
recall = diag / rowsums
f1 = 2 * precision * recall / (precision + recall)
data.frame(precision, recall, f1)
## precision recall f1
## As Expected 0.9606212 0.9959747 0.9779785
## Better 0.8666667 0.2765957 0.4193548
## Worse 0.8064516 0.3968254 0.5319149