# Work from the folder that holds the course data.
# NOTE(review): this is a machine-specific absolute path; the script only runs
# as-is where this folder exists.
setwd("C:/Users/daria.alekseeva/Documents/Edx/Trees/")

# Load the claims table from CSV and show its structure.
Claims <- read.csv(file = "ClaimsData.csv/ClaimsData.csv")
str(object = Claims)
## 'data.frame': 458005 obs. of 16 variables:
## $ age : int 85 59 67 52 67 68 75 70 67 67 ...
## $ alzheimers : int 0 0 0 0 0 0 0 0 0 0 ...
## $ arthritis : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cancer : int 0 0 0 0 0 0 0 0 0 0 ...
## $ copd : int 0 0 0 0 0 0 0 0 0 0 ...
## $ depression : int 0 0 0 0 0 0 0 0 0 0 ...
## $ diabetes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ heart.failure : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ihd : int 0 0 0 0 0 0 0 0 0 0 ...
## $ kidney : int 0 0 0 0 0 0 0 0 0 0 ...
## $ osteoporosis : int 0 0 0 0 0 0 0 0 0 0 ...
## $ stroke : int 0 0 0 0 0 0 0 0 0 0 ...
## $ reimbursement2008: int 0 0 0 0 0 0 0 0 0 0 ...
## $ bucket2008 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ reimbursement2009: int 0 0 0 0 0 0 0 0 0 0 ...
## $ bucket2009 : int 1 1 1 1 1 1 1 1 1 1 ...
# Share of all patients falling into each 2009 cost bucket
table(Claims[["bucket2009"]]) / nrow(Claims)
##
## 1 2 3 4 5
## 0.671267781 0.190170413 0.089466272 0.043324855 0.005770679

# Train/test split: 60% of rows go to training, split on the 2009 bucket.
# The seed is fixed so the split is reproducible.
library(caTools)
set.seed(88)
spl <- sample.split(Claims$bucket2009, SplitRatio = 0.6)
ClaimsTrain <- subset(Claims, spl)   # rows flagged TRUE -> training set
ClaimsTest <- subset(Claims, !spl)   # rows flagged FALSE -> test set
# What is the average age of patients in the training set, ClaimsTrain?
mean(ClaimsTrain$age)
## [1] 72.63773

# What proportion of people in the training set (ClaimsTrain) had at least one
# diagnosis code for diabetes?
# The mean of a logical vector is the fraction of TRUE values, so this gives
# the share of rows with diabetes >= 1 without building a subset data frame.
mean(ClaimsTrain$diabetes >= 1)
## [1] 0.3808983
# Baseline method: use each patient's 2008 cost bucket as the prediction for
# their 2009 cost bucket. Rows of the table are the actual 2009 bucket,
# columns are the 2008 bucket.
BaselineTable <- table(ClaimsTest$bucket2009, ClaimsTest$bucket2008)
BaselineTable

# Baseline accuracy: the diagonal holds the patients whose bucket did not
# change, so its sum over the number of test rows is the share predicted
# correctly.
sum(diag(BaselineTable)) / nrow(ClaimsTest)

# Penalty matrix, laid out like the tables above (rows = actual bucket,
# columns = predicted bucket). Entries below the diagonal (prediction lower
# than the actual bucket) are twice as large as the mirrored entries above it.
PenaltyMatrix <- matrix(
  c(0, 1, 2, 3, 4,
    2, 0, 1, 2, 3,
    4, 2, 0, 1, 2,
    6, 4, 2, 0, 1,
    8, 6, 4, 2, 0),
  byrow = TRUE, nrow = 5
)
PenaltyMatrix

# Elementwise product: each cell's count weighted by its penalty.
as.matrix(BaselineTable) * PenaltyMatrix

# Baseline penalty error: total penalty divided by the number of test rows.
sum(as.matrix(BaselineTable) * PenaltyMatrix) / nrow(ClaimsTest)
# Packages for fitting and plotting the classification tree.
library(rpart)
library(rpart.plot)

# Fit a classification tree for the 2009 cost bucket from age, the diagnosis
# indicators, and the 2008 bucket and reimbursement, using the training set.
ClaimsTree <- rpart(
  bucket2009 ~ age + alzheimers + arthritis + cancer + copd + depression +
    diabetes + heart.failure + ihd + kidney + osteoporosis + stroke +
    bucket2008 + reimbursement2008,
  data = ClaimsTrain,
  method = "class",
  cp = 0.00005
)
prp(ClaimsTree)

# Predict a bucket for every test-set patient.
PredictTest <- predict(ClaimsTree, newdata = ClaimsTest, type = "class")

# Classification table: rows = actual 2009 bucket, columns = predicted bucket.
TreeTable <- table(ClaimsTest$bucket2009, PredictTest)
TreeTable

# Accuracy: correctly classified patients (the diagonal) over all test rows.
sum(diag(TreeTable)) / nrow(ClaimsTest)

# Counts weighted elementwise by their penalties.
as.matrix(TreeTable) * PenaltyMatrix

# Penalty error: total penalty divided by the number of test rows.
sum(as.matrix(TreeTable) * PenaltyMatrix) / nrow(ClaimsTest)
# Refit the tree with the same formula and cp, this time passing PenaltyMatrix
# to rpart as the loss matrix via `parms`.
ClaimsTree <- rpart(
  bucket2009 ~ age + alzheimers + arthritis + cancer + copd + depression +
    diabetes + heart.failure + ihd + kidney + osteoporosis + stroke +
    bucket2008 + reimbursement2008,
  data = ClaimsTrain,
  method = "class",
  cp = 0.00005,
  parms = list(loss = PenaltyMatrix)
)

# Predict a bucket for every test-set patient with the refitted tree.
PredictTest <- predict(ClaimsTree, newdata = ClaimsTest, type = "class")

# Classification table: rows = actual 2009 bucket, columns = predicted bucket.
LossTreeTable <- table(ClaimsTest$bucket2009, PredictTest)
LossTreeTable

# Accuracy: correctly classified patients (the diagonal) over all test rows.
sum(diag(LossTreeTable)) / nrow(ClaimsTest)

# Penalty error: total penalty divided by the number of test rows.
sum(as.matrix(LossTreeTable) * PenaltyMatrix) / nrow(ClaimsTest)