Unit 4 - “Keeping an Eye on Healthcare Costs” Lecture

VIDEO 6

setwd('C:/Users/daria.alekseeva/Documents/Edx/Trees/')
# Read in the data
Claims = read.csv("ClaimsData.csv/ClaimsData.csv")

str(Claims)

## 'data.frame':    458005 obs. of  16 variables:
##  $ age              : int  85 59 67 52 67 68 75 70 67 67 ...
##  $ alzheimers       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ arthritis        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ cancer           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ copd             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ depression       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ diabetes         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ heart.failure    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ihd              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ kidney           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ osteoporosis     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ stroke           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ reimbursement2008: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ bucket2008       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ reimbursement2009: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ bucket2009       : int  1 1 1 1 1 1 1 1 1 1 ...

# Percentage of patients in each cost bucket
table(Claims$bucket2009)/nrow(Claims)

## 
##           1           2           3           4           5 
## 0.671267781 0.190170413 0.089466272 0.043324855 0.005770679

# Split the data
library(caTools)

set.seed(88)

spl = sample.split(Claims$bucket2009, SplitRatio = 0.6)

ClaimsTrain = subset(Claims, spl==TRUE)

ClaimsTest = subset(Claims, spl==FALSE)

What is the average age of patients in the training set, ClaimsTrain?

mean(ClaimsTrain$age)

## [1] 72.63773

What proportion of people in the training set (ClaimsTrain) had at least one diagnosis code for diabetes?

nrow(ClaimsTrain[ClaimsTrain$diabetes>=1,]) / nrow(ClaimsTrain)

## [1] 0.3808983

VIDEO 7

Baseline method

table(ClaimsTest$bucket2009, ClaimsTest$bucket2008)

(110138 + 10721 + 2774 + 1539 + 104)/nrow(ClaimsTest)

Penalty Matrix

PenaltyMatrix = matrix(c(0,1,2,3,4,2,0,1,2,3,4,2,0,1,2,6,4,2,0,1,8,6,4,2,0), byrow=TRUE, nrow=5)

PenaltyMatrix

Penalty Error of Baseline Method

as.matrix(table(ClaimsTest$bucket2009, ClaimsTest$bucket2008))*PenaltyMatrix

sum(as.matrix(table(ClaimsTest$bucket2009, ClaimsTest$bucket2008))*PenaltyMatrix)/nrow(ClaimsTest)

VIDEO 8

Load necessary libraries

library(rpart) library(rpart.plot)

CART model

ClaimsTree = rpart(bucket2009 ~ age + alzheimers + arthritis + cancer + copd + depression + diabetes + heart.failure + ihd + kidney + osteoporosis + stroke + bucket2008 + reimbursement2008, data=ClaimsTrain, method=“class”, cp=0.00005)

prp(ClaimsTree)

Make predictions

PredictTest = predict(ClaimsTree, newdata = ClaimsTest, type = “class”)

table(ClaimsTest$bucket2009, PredictTest)

(114141 + 16102 + 118 + 201 + 0)/nrow(ClaimsTest)

Penalty Error

as.matrix(table(ClaimsTest$bucket2009, PredictTest))*PenaltyMatrix

sum(as.matrix(table(ClaimsTest$bucket2009, PredictTest))*PenaltyMatrix)/nrow(ClaimsTest)

New CART model with loss matrix

Redo predictions and penalty error

PredictTest = predict(ClaimsTree, newdata = ClaimsTest, type = “class”)

table(ClaimsTest$bucket2009, PredictTest)

(94310 + 18942 + 4692 + 636 + 2)/nrow(ClaimsTest)

sum(as.matrix(table(ClaimsTest$bucket2009, PredictTest))*PenaltyMatrix)/nrow(ClaimsTest) ```