The data

claims <- read.csv("ClaimsData.csv")
str(claims)
## 'data.frame':    458005 obs. of  16 variables:
##  $ age              : int  85 59 67 52 67 68 75 70 67 67 ...
##  $ alzheimers       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ arthritis        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ cancer           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ copd             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ depression       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ diabetes         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ heart.failure    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ihd              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ kidney           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ osteoporosis     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ stroke           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ reimbursement2008: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ bucket2008       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ reimbursement2009: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ bucket2009       : int  1 1 1 1 1 1 1 1 1 1 ...

bucket2009 is the dependent variable and is the cost bucket. Cost buckets are:

etc.

Split into testing and training

library(caTools)
## Warning: package 'caTools' was built under R version 3.1.3
set.seed(88)
spl <- sample.split(claims$bucket2009, SplitRatio=0.6)
claimsTrain <- subset(claims, spl == TRUE)
claimsTest <- subset(claims, spl == FALSE)

Average age of patients in training set and proportion of at least 1 diagnosis code for diabetes

(mean(claimsTrain$age))
## [1] 72.63773
table(claimsTrain$diabetes) / nrow(claimsTrain)
## 
##         0         1 
## 0.6191017 0.3808983

Baseline accuracy - cost in 2009 is same as 2009

confmat <- table(claimsTest$bucket2009, claimsTest$bucket2008)
N <- nrow(claimsTest)
sum(diag(confmat)) / N
## [1] 0.6838135

Penalty matrix:

The x-axis is the predicted cost, and the y-axis is the actual cost.

penalmat <- matrix(c(0,1,2,3,4,
                     2,0,1,2,3,
                     4,2,0,1,2,
                     6,4,2,0,1,
                     8,6,4,2,0), byrow=TRUE, nrow=5)

Penalty Error: Note that * is a element-by-element multiplication in R:

sum(as.matrix(confmat) * penalmat) / N
## [1] 0.7386055

Accuracy of a baseline model that predicts cost bucket of 1 for everyone:

m <- table(claimsTest$bucket2009, rep(1, nrow(claimsTest)))
m[1,1] / sum(m)
## [1] 0.67127

Penalty error of such baseline model:

sum(m * penalmat[,1]) / N
## [1] 1.044301

CART to predict healthcare cost. CP was from previous cross validation

library(rpart)
## Warning: package 'rpart' was built under R version 3.1.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.1.3
claimsTree <- rpart(bucket2009 ~ age + alzheimers + arthritis + cancer + copd +
                      depression + diabetes + heart.failure + ihd +
                      kidney + osteoporosis + stroke + reimbursement2008,
                    data=claimsTrain, method="class", cp=0.00005)
prp(claimsTree)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting

Make predictions against test data and calculate accuracy

predictTest <- predict(claimsTree, newdata=claimsTest, type="class")
(confmat <- table(claimsTest$bucket2009, predictTest))
##    predictTest
##          1      2      3      4      5
##   1 114141   8610    124    103      0
##   2  18409  16102    187    142      0
##   3   8027   8146    118     99      0
##   4   3099   4584     53    201      0
##   5    351    657      4     45      0
N <- nrow(claimsTest)
sum(diag(confmat)) / N
## [1] 0.7126669

Penalty Error

sum(as.matrix(confmat) * penalmat) / N
## [1] 0.7578902

To tell rpart to use penalty matrix:

claimsTree <- rpart(bucket2009 ~ age + alzheimers + arthritis + cancer + copd +
                      depression + diabetes + heart.failure + ihd +
                      kidney + osteoporosis + stroke + reimbursement2008,
                    data=claimsTrain, method="class", cp=0.00005,
                    parms=list(loss=penalmat))

New accuracy and penalty error

predictTest <- predict(claimsTree, newdata=claimsTest, type="class")
(confmat <- table(claimsTest$bucket2009, predictTest))
##    predictTest
##         1     2     3     4     5
##   1 94310 25295  3087   286     0
##   2  7176 18942  8079   643     0
##   3  3590  7706  4692   401     1
##   4  1304  3193  2803   636     1
##   5   135   356   408   156     2
(sum(diag(confmat)) / N)
## [1] 0.6472746
sum(as.matrix(confmat) * penalmat) / N
## [1] 0.6418161