This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Unit 4 - "Keeping an Eye on Healthcare Costs" Lecture
#setwd("C:/Users/aaron/Desktop/MIT") VIDEO 6
# Read in the data
Claims = read.csv("ClaimsData.csv")
str(Claims)
## 'data.frame': 458005 obs. of 16 variables:
## $ age : int 85 59 67 52 67 68 75 70 67 67 ...
## $ alzheimers : int 0 0 0 0 0 0 0 0 0 0 ...
## $ arthritis : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cancer : int 0 0 0 0 0 0 0 0 0 0 ...
## $ copd : int 0 0 0 0 0 0 0 0 0 0 ...
## $ depression : int 0 0 0 0 0 0 0 0 0 0 ...
## $ diabetes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ heart.failure : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ihd : int 0 0 0 0 0 0 0 0 0 0 ...
## $ kidney : int 0 0 0 0 0 0 0 0 0 0 ...
## $ osteoporosis : int 0 0 0 0 0 0 0 0 0 0 ...
## $ stroke : int 0 0 0 0 0 0 0 0 0 0 ...
## $ reimbursement2008: int 0 0 0 0 0 0 0 0 0 0 ...
## $ bucket2008 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ reimbursement2009: int 0 0 0 0 0 0 0 0 0 0 ...
## $ bucket2009 : int 1 1 1 1 1 1 1 1 1 1 ...
# Percentage of patients in each cost bucket
table(Claims$bucket2009)/nrow(Claims)
##
## 1 2 3 4 5
## 0.671267781 0.190170413 0.089466272 0.043324855 0.005770679
# Split the data
library(caTools)
## Warning: package 'caTools' was built under R version 3.1.3
set.seed(88)
spl = sample.split(Claims$bucket2009, SplitRatio = 0.6)
ClaimsTrain = subset(Claims, spl==TRUE)
ClaimsTest = subset(Claims, spl==FALSE)
# VIDEO 7
# Baseline method
table(ClaimsTest$bucket2009, ClaimsTest$bucket2008)
##
## 1 2 3 4 5
## 1 110138 7787 3427 1452 174
## 2 16000 10721 4629 2931 559
## 3 7006 4629 2774 1621 360
## 4 2688 1943 1415 1539 352
## 5 293 191 160 309 104
(110138 + 10721 + 2774 + 1539 + 104)/nrow(ClaimsTest)
## [1] 0.6838135
# Penalty Matrix
PenaltyMatrix = matrix(c(0,1,2,3,4,2,0,1,2,3,4,2,0,1,2,6,4,2,0,1,8,6,4,2,0), byrow=TRUE, nrow=5)
PenaltyMatrix
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0 1 2 3 4
## [2,] 2 0 1 2 3
## [3,] 4 2 0 1 2
## [4,] 6 4 2 0 1
## [5,] 8 6 4 2 0
# Penalty Error of Baseline Method
as.matrix(table(ClaimsTest$bucket2009, ClaimsTest$bucket2008))*PenaltyMatrix
##
## 1 2 3 4 5
## 1 0 7787 6854 4356 696
## 2 32000 0 4629 5862 1677
## 3 28024 9258 0 1621 720
## 4 16128 7772 2830 0 352
## 5 2344 1146 640 618 0
sum(as.matrix(table(ClaimsTest$bucket2009, ClaimsTest$bucket2008))*PenaltyMatrix)/nrow(ClaimsTest)
## [1] 0.7386055
# VIDEO 8
# Load necessary libraries
library(rpart)
## Warning: package 'rpart' was built under R version 3.1.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.1.3
# CART model
ClaimsTree = rpart(bucket2009 ~ age + alzheimers + arthritis + cancer + copd + depression + diabetes + heart.failure + ihd + kidney + osteoporosis + stroke + bucket2008 + reimbursement2008, data=ClaimsTrain, method="class", cp=0.00005)
prp(ClaimsTree)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting
# Make predictions
PredictTest = predict(ClaimsTree, newdata = ClaimsTest, type = "class")
table(ClaimsTest$bucket2009, PredictTest)
## PredictTest
## 1 2 3 4 5
## 1 114141 8610 124 103 0
## 2 18409 16102 187 142 0
## 3 8027 8146 118 99 0
## 4 3099 4584 53 201 0
## 5 351 657 4 45 0
(114141 + 16102 + 118 + 201 + 0)/nrow(ClaimsTest)
## [1] 0.7126669
# Penalty Error
as.matrix(table(ClaimsTest$bucket2009, PredictTest))*PenaltyMatrix
## PredictTest
## 1 2 3 4 5
## 1 0 8610 248 309 0
## 2 36818 0 187 284 0
## 3 32108 16292 0 99 0
## 4 18594 18336 106 0 0
## 5 2808 3942 16 90 0
sum(as.matrix(table(ClaimsTest$bucket2009, PredictTest))*PenaltyMatrix)/nrow(ClaimsTest)
## [1] 0.7578902
# New CART model with loss matrix
ClaimsTree = rpart(bucket2009 ~ age + alzheimers + arthritis + cancer + copd + depression + diabetes + heart.failure + ihd + kidney + osteoporosis + stroke + bucket2008 + reimbursement2008, data=ClaimsTrain, method="class", cp=0.00005, parms=list(loss=PenaltyMatrix))
# Redo predictions and penalty error
PredictTest = predict(ClaimsTree, newdata = ClaimsTest, type = "class")
table(ClaimsTest$bucket2009, PredictTest)
## PredictTest
## 1 2 3 4 5
## 1 94310 25295 3087 286 0
## 2 7176 18942 8079 643 0
## 3 3590 7706 4692 401 1
## 4 1304 3193 2803 636 1
## 5 135 356 408 156 2
(94310 + 18942 + 4692 + 636 + 2)/nrow(ClaimsTest)
## [1] 0.6472746
sum(as.matrix(table(ClaimsTest$bucket2009, PredictTest))*PenaltyMatrix)/nrow(ClaimsTest)
## [1] 0.6418161