This is my second attempt at applying CART analysis to the decisions of Justice Stevens.

# Read in the Stevens data and inspect its structure
stevens = read.csv("/Users/michaelreinhard/Google Drive/R/edx_analytics/stevens.csv")
str(stevens)
## 'data.frame':    566 obs. of  9 variables:
##  $ Docket    : Factor w/ 566 levels "00-1011","00-1045",..: 63 69 70 145 97 181 242 289 334 436 ...
##  $ Term      : int  1994 1994 1994 1994 1995 1995 1996 1997 1997 1999 ...
##  $ Circuit   : Factor w/ 13 levels "10th","11th",..: 4 11 7 3 9 11 13 11 12 2 ...
##  $ Issue     : Factor w/ 11 levels "Attorneys","CivilRights",..: 5 5 5 5 9 5 5 5 5 3 ...
##  $ Petitioner: Factor w/ 12 levels "AMERICAN.INDIAN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Respondent: Factor w/ 12 levels "AMERICAN.INDIAN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ LowerCourt: Factor w/ 2 levels "conser","liberal": 2 2 2 1 1 1 1 1 1 1 ...
##  $ Unconst   : int  0 0 0 0 0 1 0 1 0 0 ...
##  $ Reverse   : int  1 1 1 1 1 0 1 1 1 1 ...
library(caTools)
set.seed(3000)
# Split 70/30, preserving the proportion of reversals in both pieces
spl = sample.split(stevens$Reverse, SplitRatio = 0.7)
train = subset(stevens, spl==TRUE)
test = subset(stevens, spl==FALSE)
library(rpart)
library(rpart.plot)

Now I run the actual model.

# Classification tree predicting Reverse; minbucket = 25 requires at least 25 cases in each leaf
stevensTree = rpart(Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst, data = train, method = "class", minbucket = 25)
prp(stevensTree)

[Plot: the CART tree drawn by prp(stevensTree).]

The tree comes out beautifully, just like it did in the video.

Now I want to compare the model’s performance with a naive baseline that simply predicts the most common outcome for every case.

First I use the predict() function on the test data to generate predictions, then compare them with the actual outcomes using the table() function. In the resulting confusion matrix, the rows are the actual outcomes and the columns are the predictions.

predictCART = predict(stevensTree, newdata = test, type = "class")
table(test$Reverse, predictCART)
##    predictCART
##      0  1
##   0 41 36
##   1 22 71
table(test$Reverse)
## 
##  0  1 
## 77 93
93/(93+77)
## [1] 0.5471
(41+71)/(41+36+22+71)
## [1] 0.6588

So the baseline model, which always predicts that Stevens reverses, would be right about 55% of the time on the test set, whereas the CART tree correctly predicts his decisions to reverse or affirm about 66% of the time.
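These accuracies can also be computed directly rather than by hand; a short sketch using the objects above:

# Baseline accuracy: always predict the most common outcome in the test set
max(table(test$Reverse)) / nrow(test)
# CART accuracy: correct predictions sit on the diagonal of the confusion matrix
sum(diag(table(test$Reverse, predictCART))) / nrow(test)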

Now I compute the ROC curve for the model; the area under the curve measures how well the predicted probabilities separate reversals from affirmances.

library(ROCR)
## Loading required package: gplots
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess
# Without type = "class", predict() returns the class probabilities
predictROC = predict(stevensTree, newdata = test)
head(predictROC)
##           0      1
## 1   0.30357 0.6964
## 3   0.30357 0.6964
## 4   0.40000 0.6000
## 6   0.40000 0.6000
## 8   0.40000 0.6000
## 21  0.30357 0.6964

Only the first rows are shown here; the full matrix has one row per test case, with the probability of affirming in column 0 and the probability of reversing in column 1. Only eight distinct probability pairs appear, one for each leaf of the tree.
# ROC curve: use the second column (the probability of reversal) as the score
pred = prediction(predictROC[,2], test$Reverse)
perf = performance(pred, "tpr", "fpr")
plot(perf)
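ROCR can also report the area under this curve directly; this step was not part of my original session:

# Extract the AUC (area under the ROC curve) from the same prediction object
as.numeric(performance(pred, "auc")@y.values)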

[Plot: ROC curve for the CART model.]

Things that I learned from this session:

All of the file loading worked once I got the path right. Remember to begin the path with a “/” so that R knows it is an absolute path.
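Relative paths, by contrast, are resolved against the working directory; a quick way to check it (not run in the original session):

getwd()  # relative paths in read.csv() are resolved against this directory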

Command + Return runs the current line of code (in RStudio on a Mac).

In R Markdown, the braces in the chunk header have to contain a lowercase “r” (```{r}) for the text between the delimiters to be executed as R code.

If you just run predict() with a model and new data, you get the class probabilities for each case. To get a 0/1 prediction instead, pass type = "class".

nrow(test$Reverse) and nrow(predictCART) both return NULL. This puzzled me at first, but the reason is that nrow() only applies to two-dimensional objects like matrices and data frames; test$Reverse is a vector and predictCART is a factor, so neither has rows at all. table() worked fine anyway because it accepts vectors and factors directly.
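Since they are one-dimensional, length() is the right way to count their elements; a quick check (not run in the original session):

length(test$Reverse)  # number of test cases: 77 + 93 = 170
length(predictCART)   # one prediction per test case, so also 170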