The data
claims <- read.csv("ClaimsData.csv")
str(claims)
## 'data.frame': 458005 obs. of 16 variables:
## $ age : int 85 59 67 52 67 68 75 70 67 67 ...
## $ alzheimers : int 0 0 0 0 0 0 0 0 0 0 ...
## $ arthritis : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cancer : int 0 0 0 0 0 0 0 0 0 0 ...
## $ copd : int 0 0 0 0 0 0 0 0 0 0 ...
## $ depression : int 0 0 0 0 0 0 0 0 0 0 ...
## $ diabetes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ heart.failure : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ihd : int 0 0 0 0 0 0 0 0 0 0 ...
## $ kidney : int 0 0 0 0 0 0 0 0 0 0 ...
## $ osteoporosis : int 0 0 0 0 0 0 0 0 0 0 ...
## $ stroke : int 0 0 0 0 0 0 0 0 0 0 ...
## $ reimbursement2008: int 0 0 0 0 0 0 0 0 0 0 ...
## $ bucket2008 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ reimbursement2009: int 0 0 0 0 0 0 0 0 0 0 ...
## $ bucket2009 : int 1 1 1 1 1 1 1 1 1 1 ...
bucket2009 is the dependent variable and is the cost bucket. Cost buckets are:
etc.
Split into testing and training
library(caTools)
## Warning: package 'caTools' was built under R version 3.1.3
set.seed(88)
spl <- sample.split(claims$bucket2009, SplitRatio=0.6)
claimsTrain <- subset(claims, spl == TRUE)
claimsTest <- subset(claims, spl == FALSE)
Average age of patients in training set and proportion of at least 1 diagnosis code for diabetes
(mean(claimsTrain$age))
## [1] 72.63773
table(claimsTrain$diabetes) / nrow(claimsTrain)
##
## 0 1
## 0.6191017 0.3808983
Baseline accuracy - cost in 2009 is same as 2009
confmat <- table(claimsTest$bucket2009, claimsTest$bucket2008)
N <- nrow(claimsTest)
sum(diag(confmat)) / N
## [1] 0.6838135
Penalty matrix:
The x-axis is the predicted cost, and the y-axis is the actual cost.
penalmat <- matrix(c(0,1,2,3,4,
2,0,1,2,3,
4,2,0,1,2,
6,4,2,0,1,
8,6,4,2,0), byrow=TRUE, nrow=5)
Penalty Error: Note that * is a element-by-element multiplication in R:
sum(as.matrix(confmat) * penalmat) / N
## [1] 0.7386055
Accuracy of a baseline model that predicts cost bucket of 1 for everyone:
m <- table(claimsTest$bucket2009, rep(1, nrow(claimsTest)))
m[1,1] / sum(m)
## [1] 0.67127
Penalty error of such baseline model:
sum(m * penalmat[,1]) / N
## [1] 1.044301
CART to predict healthcare cost. CP was from previous cross validation
library(rpart)
## Warning: package 'rpart' was built under R version 3.1.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.1.3
claimsTree <- rpart(bucket2009 ~ age + alzheimers + arthritis + cancer + copd +
depression + diabetes + heart.failure + ihd +
kidney + osteoporosis + stroke + reimbursement2008,
data=claimsTrain, method="class", cp=0.00005)
prp(claimsTree)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting
Make predictions against test data and calculate accuracy
predictTest <- predict(claimsTree, newdata=claimsTest, type="class")
(confmat <- table(claimsTest$bucket2009, predictTest))
## predictTest
## 1 2 3 4 5
## 1 114141 8610 124 103 0
## 2 18409 16102 187 142 0
## 3 8027 8146 118 99 0
## 4 3099 4584 53 201 0
## 5 351 657 4 45 0
N <- nrow(claimsTest)
sum(diag(confmat)) / N
## [1] 0.7126669
Penalty Error
sum(as.matrix(confmat) * penalmat) / N
## [1] 0.7578902
To tell rpart to use penalty matrix:
claimsTree <- rpart(bucket2009 ~ age + alzheimers + arthritis + cancer + copd +
depression + diabetes + heart.failure + ihd +
kidney + osteoporosis + stroke + reimbursement2008,
data=claimsTrain, method="class", cp=0.00005,
parms=list(loss=penalmat))
New accuracy and penalty error
predictTest <- predict(claimsTree, newdata=claimsTest, type="class")
(confmat <- table(claimsTest$bucket2009, predictTest))
## predictTest
## 1 2 3 4 5
## 1 94310 25295 3087 286 0
## 2 7176 18942 8079 643 0
## 3 3590 7706 4692 401 1
## 4 1304 3193 2803 636 1
## 5 135 356 408 156 2
(sum(diag(confmat)) / N)
## [1] 0.6472746
sum(as.matrix(confmat) * penalmat) / N
## [1] 0.6418161