This is my second attempt at applying CART analysis to the decisions of Justice Stevens.
# Load the Stevens decisions data. stringsAsFactors = TRUE is spelled out
# because read.csv() stopped converting strings to factors by default in
# R 4.0.0; with it, the text columns load as factors on every R version.
stevens <- read.csv(
  "/Users/michaelreinhard/Google Drive/R/edx_analytics/stevens.csv",
  stringsAsFactors = TRUE
)
# Inspect the column types and the first few values of each column.
str(stevens)
## 'data.frame': 566 obs. of 9 variables:
## $ Docket : Factor w/ 566 levels "00-1011","00-1045",..: 63 69 70 145 97 181 242 289 334 436 ...
## $ Term : int 1994 1994 1994 1994 1995 1995 1996 1997 1997 1999 ...
## $ Circuit : Factor w/ 13 levels "10th","11th",..: 4 11 7 3 9 11 13 11 12 2 ...
## $ Issue : Factor w/ 11 levels "Attorneys","CivilRights",..: 5 5 5 5 9 5 5 5 5 3 ...
## $ Petitioner: Factor w/ 12 levels "AMERICAN.INDIAN",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Respondent: Factor w/ 12 levels "AMERICAN.INDIAN",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ LowerCourt: Factor w/ 2 levels "conser","liberal": 2 2 2 1 1 1 1 1 1 1 ...
## $ Unconst : int 0 0 0 0 0 1 0 1 0 0 ...
## $ Reverse : int 1 1 1 1 1 0 1 1 1 1 ...
library(caTools)
# Fix the RNG seed so the train/test split is reproducible.
set.seed(3000)
# Logical flags picking 70% of the cases for training, split on Reverse.
spl <- sample.split(stevens$Reverse, SplitRatio = 0.7)
# Flagged rows form the training set; the remaining rows form the test set.
train <- stevens[spl, ]
test <- stevens[!spl, ]
library(rpart)
library(rpart.plot)
Now I run the actual models.
# Fit a classification tree for Reverse on the case characteristics.
# minbucket = 25 requires at least 25 training cases in every leaf.
stevensTree <- rpart(
  Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst,
  data = train,
  method = "class",
  minbucket = 25
)
# Draw the fitted tree.
prp(stevensTree)
The tree comes out beautifully, just like it did in the video.
Now I want to compare the model’s performance with a naive prediction of simply taking the most common outcome and predicting that for all cases.
First I use the predict() function on the test data to generate predictions and compare them with the actual outcomes in the test set using the table function.
# Predicted class (0 or 1) for every case in the test set.
predictCART <- predict(
  stevensTree,
  newdata = test,
  type = "class"
)
# Confusion matrix: actual outcomes in rows, predictions in columns.
table(test$Reverse, predictCART)
## predictCART
## 0 1
## 0 41 36
## 1 22 71
# Count the actual outcomes in the test set (0 and 1 values of Reverse);
# these counts give the accuracy of always predicting the majority class.
table(test$Reverse)
##
## 0 1
## 77 93
# Baseline accuracy: share of test cases that are reversals (Reverse == 1),
# i.e. how often "always predict reverse" is right. Computed from the data
# rather than typed-in counts, so it stays correct if the split changes.
mean(test$Reverse == 1)
## [1] 0.5471
# CART accuracy: share of test cases where the predicted class equals the
# actual outcome. Computed from the predictions rather than typed-in
# confusion-matrix cells, so it stays correct if the split or model changes.
mean(predictCART == test$Reverse)
## [1] 0.6588
So the baseline model would predict that he always reverses and would be right 55% of the time in the test data set, whereas the CART tree correctly predicts Stevens’ decisions to reverse or affirm 66% of the time.
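Now we evaluate the model with an ROC curve, which plots the true positive rate against the false positive rate as the classification threshold varies. The area “under the curve” (AUC) summarizes how well the predicted probabilities separate reversals from affirmances.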
library(ROCR)
## Loading required package: gplots
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
# Without type = "class", predict() returns one row per test case holding
# the predicted probability of each class (columns "0" and "1").
predictROC <- predict(
  stevensTree,
  newdata = test
)
# Print the full probability matrix.
predictROC
## 0 1
## 1 0.30357 0.6964
## 3 0.30357 0.6964
## 4 0.40000 0.6000
## 6 0.40000 0.6000
## 8 0.40000 0.6000
## 21 0.30357 0.6964
## 32 0.55172 0.4483
## 36 0.55172 0.4483
## 40 0.30357 0.6964
## 42 0.55172 0.4483
## 46 0.55172 0.4483
## 47 0.40000 0.6000
## 53 0.55172 0.4483
## 55 0.30357 0.6964
## 59 0.18421 0.8158
## 60 0.40000 0.6000
## 66 0.40000 0.6000
## 67 0.40000 0.6000
## 68 0.18421 0.8158
## 72 0.30357 0.6964
## 79 0.30357 0.6964
## 80 0.55172 0.4483
## 87 0.76000 0.2400
## 88 0.18421 0.8158
## 92 0.79104 0.2090
## 95 0.79104 0.2090
## 102 0.79104 0.2090
## 106 0.79104 0.2090
## 110 0.79104 0.2090
## 112 0.79104 0.2090
## 114 0.79104 0.2090
## 125 0.79104 0.2090
## 130 0.79104 0.2090
## 134 0.79104 0.2090
## 138 0.79104 0.2090
## 145 0.79104 0.2090
## 146 0.79104 0.2090
## 148 0.30357 0.6964
## 149 0.30357 0.6964
## 152 0.30357 0.6964
## 154 0.55172 0.4483
## 161 0.78788 0.2121
## 164 0.40000 0.6000
## 167 0.78788 0.2121
## 169 0.30357 0.6964
## 171 0.76000 0.2400
## 175 0.55172 0.4483
## 176 0.07547 0.9245
## 177 0.07547 0.9245
## 178 0.07547 0.9245
## 180 0.07547 0.9245
## 187 0.07547 0.9245
## 188 0.78788 0.2121
## 190 0.07547 0.9245
## 192 0.07547 0.9245
## 196 0.07547 0.9245
## 197 0.30357 0.6964
## 208 0.30357 0.6964
## 210 0.07547 0.9245
## 216 0.79104 0.2090
## 218 0.79104 0.2090
## 220 0.07547 0.9245
## 224 0.40000 0.6000
## 226 0.76000 0.2400
## 227 0.40000 0.6000
## 228 0.78788 0.2121
## 235 0.30357 0.6964
## 239 0.78788 0.2121
## 242 0.76000 0.2400
## 244 0.76000 0.2400
## 247 0.40000 0.6000
## 255 0.30357 0.6964
## 260 0.55172 0.4483
## 261 0.76000 0.2400
## 264 0.30357 0.6964
## 265 0.30357 0.6964
## 268 0.30357 0.6964
## 272 0.55172 0.4483
## 273 0.30357 0.6964
## 274 0.55172 0.4483
## 275 0.30357 0.6964
## 282 0.40000 0.6000
## 286 0.78788 0.2121
## 291 0.40000 0.6000
## 294 0.18421 0.8158
## 305 0.40000 0.6000
## 306 0.30357 0.6964
## 308 0.78788 0.2121
## 311 0.78788 0.2121
## 313 0.78788 0.2121
## 314 0.78788 0.2121
## 315 0.78788 0.2121
## 317 0.78788 0.2121
## 320 0.78788 0.2121
## 321 0.78788 0.2121
## 323 0.40000 0.6000
## 331 0.30357 0.6964
## 335 0.30357 0.6964
## 338 0.76000 0.2400
## 341 0.55172 0.4483
## 345 0.55172 0.4483
## 346 0.30357 0.6964
## 350 0.30357 0.6964
## 352 0.30357 0.6964
## 353 0.18421 0.8158
## 355 0.30357 0.6964
## 356 0.18421 0.8158
## 358 0.30357 0.6964
## 359 0.30357 0.6964
## 360 0.40000 0.6000
## 361 0.40000 0.6000
## 362 0.55172 0.4483
## 364 0.30357 0.6964
## 368 0.30357 0.6964
## 381 0.40000 0.6000
## 382 0.18421 0.8158
## 384 0.30357 0.6964
## 387 0.18421 0.8158
## 389 0.30357 0.6964
## 390 0.40000 0.6000
## 394 0.30357 0.6964
## 400 0.78788 0.2121
## 402 0.40000 0.6000
## 405 0.78788 0.2121
## 408 0.30357 0.6964
## 410 0.30357 0.6964
## 416 0.40000 0.6000
## 422 0.76000 0.2400
## 432 0.07547 0.9245
## 434 0.79104 0.2090
## 436 0.07547 0.9245
## 441 0.79104 0.2090
## 444 0.07547 0.9245
## 448 0.07547 0.9245
## 450 0.07547 0.9245
## 451 0.07547 0.9245
## 452 0.79104 0.2090
## 454 0.07547 0.9245
## 456 0.07547 0.9245
## 459 0.07547 0.9245
## 462 0.07547 0.9245
## 464 0.07547 0.9245
## 467 0.07547 0.9245
## 468 0.07547 0.9245
## 470 0.07547 0.9245
## 473 0.07547 0.9245
## 476 0.07547 0.9245
## 478 0.07547 0.9245
## 480 0.07547 0.9245
## 482 0.07547 0.9245
## 483 0.07547 0.9245
## 484 0.07547 0.9245
## 494 0.79104 0.2090
## 498 0.18421 0.8158
## 504 0.40000 0.6000
## 509 0.40000 0.6000
## 521 0.76000 0.2400
## 527 0.40000 0.6000
## 531 0.40000 0.6000
## 535 0.40000 0.6000
## 538 0.76000 0.2400
## 539 0.18421 0.8158
## 540 0.40000 0.6000
## 543 0.76000 0.2400
## 545 0.40000 0.6000
## 546 0.79104 0.2090
## 551 0.79104 0.2090
## 552 0.79104 0.2090
## 556 0.40000 0.6000
## 558 0.18421 0.8158
# Pair each test case's predicted probability of class "1" (the second
# column of predictROC) with its actual outcome.
pred <- prediction(predictROC[, 2], test$Reverse)
# ROC curve: true positive rate against false positive rate.
perf <- performance(pred, "tpr", "fpr")
plot(perf)
# Area under the ROC curve, the single-number summary of the plot.
as.numeric(performance(pred, "auc")@y.values)
Things that I learned from this session:
So all of the file loading worked once I got the path right. Remember to begin the path with a “/” so that the program knows it is an absolute path.
Command + Return runs one line of code.
The curly brackets that open a code chunk have to contain a lower case ‘r’ (that is, {r}) to turn the text into code that R will execute.
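If you just run predict() with a model and new data you get the probabilities of each class for each case. To get a 0-1 outcome that you can compare with the actual values, run it with type = "class"; the probabilities are what you need for the ROC curve.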
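nrow(test$Reverse) and nrow(predictCART) both return NULL. At first I wondered whether that meant there were no rows in the data set, or whether it should at least return 1. The reason is that each of these is a plain vector (or factor) with no dimensions, and nrow() only works on objects that have them, such as data frames and matrices; use length() or NROW() to count the elements instead. The table worked fine because table() simply counts the values in the vectors it is given, so it does not need rows.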