# Load the stevens data set and show its structure.
# stringsAsFactors = TRUE is spelled out because R >= 4.0.0 no longer turns
# string columns into factors by default; the str() output below shows the
# text columns (Docket, Circuit, Issue, Petitioner, Respondent, LowerCourt)
# as factors, so the explicit argument keeps that layout on every R version.
stevens <- read.csv("stevens.csv", stringsAsFactors = TRUE)
str(stevens)
## 'data.frame': 566 obs. of 9 variables:
## $ Docket : Factor w/ 566 levels "00-1011","00-1045",..: 63 69 70 145 97 181 242 289 334 436 ...
## $ Term : int 1994 1994 1994 1994 1995 1995 1996 1997 1997 1999 ...
## $ Circuit : Factor w/ 13 levels "10th","11th",..: 4 11 7 3 9 11 13 11 12 2 ...
## $ Issue : Factor w/ 11 levels "Attorneys","CivilRights",..: 5 5 5 5 9 5 5 5 5 3 ...
## $ Petitioner: Factor w/ 12 levels "AMERICAN.INDIAN",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Respondent: Factor w/ 12 levels "AMERICAN.INDIAN",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ LowerCourt: Factor w/ 2 levels "conser","liberal": 2 2 2 1 1 1 1 1 1 1 ...
## $ Unconst : int 0 0 0 0 0 1 0 1 0 0 ...
## $ Reverse : int 1 1 1 1 1 0 1 1 1 1 ...
Independent variables
Dependent variable
library(caTools)
# Fix the RNG seed so the train/test partition is reproducible.
set.seed(200)
# Logical mask built from the outcome column with a 0.7 split ratio:
# TRUE rows form the training set, FALSE rows form the test set.
spl <- sample.split(stevens$Reverse, SplitRatio = 0.7)
Train <- stevens[spl, ]
Test <- stevens[!spl, ]
#install.packages("rpart")
library(rpart)
## Warning: package 'rpart' was built under R version 3.2.3
#install.packages("rpart.plot")
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.2.3
# Classification tree for Reverse on the six case attributes, fitted on the
# training set with minbucket = 25.
StevensTree <- rpart(
  Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst,
  data = Train,
  method = "class",
  minbucket = 25
)
method="class" : This tells rpart to build a classification tree instead of a regression tree.
minbucket=25 : This limits the tree so that it doesn’t overfit our training set. With minbucket=5 we get 16 splits, and with minbucket=100 we get only 1 split.
prp(StevensTree)
If the respondent is a criminal defendant, injured person, politician, state, or the United States, the prediction will be affirm (0).
A CART model is a series of decision rules which can easily be explained.
# Predicted class label for each test-set case from the CART model.
PredictCART <- predict(
  object = StevensTree,
  newdata = Test,
  type = "class"
)
type="class" : We need to give this argument when making predictions with our CART model if we want the majority-class predictions. That is, this is like using a threshold of 0.5.
table(Test$Reverse, PredictCART)
## PredictCART
## 0 1
## 0 35 42
## 1 26 67
# Test-set accuracy of the CART model: the share of test cases whose predicted
# class equals the actual outcome. It is computed from the predictions rather
# than from counts retyped out of the printed table, so it stays correct if
# the split or the tree changes. as.character() makes the comparison work
# whether Reverse is stored as an integer or as a factor.
mean(as.character(PredictCART) == as.character(Test$Reverse))
## [1] 0.6
A baseline model that always predicts Reverse (i.e., the most common outcome) has an accuracy of (26+67)/(26+67+35+42)=0.5470588. Since 0.6 > 0.5470588, our CART model beats the baseline and is competitive with logistic regression.
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
# Class probabilities instead of class labels: one row per test case and one
# column per outcome ("0" and "1"). "prob" is what predict() returns for a
# classification tree when no type is given.
PredictROC <- predict(StevensTree, newdata = Test, type = "prob")
print(PredictROC)
## 0 1
## 4 0.1444444 0.8555556
## 5 0.1444444 0.8555556
## 6 0.1444444 0.8555556
## 7 0.1444444 0.8555556
## 8 0.1444444 0.8555556
## 14 0.3888889 0.6111111
## 21 0.3888889 0.6111111
## 28 0.3888889 0.6111111
## 34 0.1444444 0.8555556
## 35 0.1444444 0.8555556
## 36 0.6603774 0.3396226
## 42 0.3888889 0.6111111
## 44 0.1444444 0.8555556
## 55 0.3888889 0.6111111
## 58 0.3888889 0.6111111
## 60 0.1444444 0.8555556
## 66 0.1444444 0.8555556
## 67 0.1444444 0.8555556
## 68 0.1444444 0.8555556
## 70 0.6603774 0.3396226
## 71 0.3888889 0.6111111
## 72 0.3888889 0.6111111
## 77 0.1444444 0.8555556
## 78 0.6603774 0.3396226
## 79 0.3888889 0.6111111
## 87 0.6486486 0.3513514
## 95 0.8260870 0.1739130
## 101 0.8260870 0.1739130
## 102 0.8260870 0.1739130
## 107 0.8260870 0.1739130
## 112 0.8260870 0.1739130
## 116 0.8260870 0.1739130
## 120 0.8260870 0.1739130
## 127 0.8260870 0.1739130
## 133 0.8260870 0.1739130
## 138 0.8260870 0.1739130
## 139 0.8260870 0.1739130
## 140 0.8260870 0.1739130
## 141 0.8260870 0.1739130
## 153 0.3888889 0.6111111
## 154 0.6603774 0.3396226
## 161 0.6603774 0.3396226
## 165 0.3888889 0.6111111
## 166 0.3888889 0.6111111
## 169 0.3888889 0.6111111
## 174 0.6603774 0.3396226
## 175 0.6603774 0.3396226
## 179 0.1444444 0.8555556
## 181 0.1444444 0.8555556
## 183 0.1444444 0.8555556
## 190 0.1444444 0.8555556
## 196 0.1444444 0.8555556
## 197 0.3888889 0.6111111
## 198 0.6603774 0.3396226
## 204 0.3888889 0.6111111
## 211 0.8260870 0.1739130
## 213 0.3066667 0.6933333
## 219 0.8260870 0.1739130
## 221 0.8260870 0.1739130
## 225 0.3888889 0.6111111
## 227 0.3066667 0.6933333
## 231 0.3066667 0.6933333
## 235 0.3888889 0.6111111
## 241 0.3066667 0.6933333
## 242 0.3066667 0.6933333
## 243 0.3066667 0.6933333
## 244 0.3066667 0.6933333
## 245 0.3066667 0.6933333
## 246 0.6486486 0.3513514
## 247 0.3066667 0.6933333
## 251 0.6603774 0.3396226
## 260 0.6603774 0.3396226
## 266 0.6603774 0.3396226
## 268 0.3888889 0.6111111
## 270 0.6603774 0.3396226
## 274 0.6603774 0.3396226
## 282 0.3066667 0.6933333
## 285 0.3888889 0.6111111
## 290 0.3066667 0.6933333
## 292 0.3888889 0.6111111
## 297 0.3066667 0.6933333
## 298 0.6603774 0.3396226
## 300 0.3888889 0.6111111
## 301 0.3888889 0.6111111
## 303 0.3888889 0.6111111
## 307 0.3066667 0.6933333
## 310 0.3066667 0.6933333
## 311 0.6603774 0.3396226
## 312 0.3888889 0.6111111
## 315 0.6603774 0.3396226
## 320 0.3888889 0.6111111
## 322 0.6603774 0.3396226
## 325 0.6486486 0.3513514
## 328 0.6603774 0.3396226
## 329 0.3888889 0.6111111
## 332 0.3066667 0.6933333
## 337 0.3888889 0.6111111
## 338 0.6486486 0.3513514
## 345 0.3888889 0.6111111
## 348 0.3066667 0.6933333
## 355 0.6603774 0.3396226
## 357 0.3066667 0.6933333
## 359 0.3888889 0.6111111
## 360 0.6486486 0.3513514
## 366 0.3888889 0.6111111
## 371 0.3066667 0.6933333
## 373 0.3888889 0.6111111
## 376 0.3066667 0.6933333
## 378 0.3888889 0.6111111
## 380 0.3066667 0.6933333
## 383 0.3888889 0.6111111
## 387 0.3066667 0.6933333
## 393 0.3066667 0.6933333
## 403 0.6603774 0.3396226
## 405 0.6603774 0.3396226
## 407 0.6603774 0.3396226
## 410 0.3888889 0.6111111
## 412 0.6603774 0.3396226
## 414 0.3066667 0.6933333
## 424 0.3888889 0.6111111
## 430 0.1444444 0.8555556
## 434 0.8260870 0.1739130
## 439 0.1444444 0.8555556
## 448 0.1444444 0.8555556
## 453 0.1444444 0.8555556
## 456 0.1444444 0.8555556
## 457 0.1444444 0.8555556
## 459 0.1444444 0.8555556
## 461 0.1444444 0.8555556
## 462 0.1444444 0.8555556
## 463 0.1444444 0.8555556
## 464 0.1444444 0.8555556
## 465 0.1444444 0.8555556
## 466 0.1444444 0.8555556
## 467 0.1444444 0.8555556
## 472 0.1444444 0.8555556
## 473 0.1444444 0.8555556
## 475 0.1444444 0.8555556
## 478 0.1444444 0.8555556
## 480 0.1444444 0.8555556
## 481 0.1444444 0.8555556
## 488 0.1444444 0.8555556
## 490 0.1444444 0.8555556
## 493 0.8260870 0.1739130
## 495 0.1444444 0.8555556
## 503 0.3066667 0.6933333
## 505 0.6486486 0.3513514
## 508 0.6486486 0.3513514
## 510 0.6486486 0.3513514
## 511 0.3066667 0.6933333
## 512 0.6486486 0.3513514
## 516 0.3066667 0.6933333
## 517 0.3066667 0.6933333
## 521 0.6486486 0.3513514
## 524 0.6486486 0.3513514
## 525 0.6486486 0.3513514
## 529 0.3066667 0.6933333
## 533 0.3066667 0.6933333
## 536 0.3066667 0.6933333
## 540 0.6486486 0.3513514
## 541 0.6486486 0.3513514
## 542 0.6486486 0.3513514
## 546 0.8260870 0.1739130
## 548 0.3066667 0.6933333
## 551 0.8260870 0.1739130
## 554 0.6486486 0.3513514
## 557 0.6486486 0.3513514
## 558 0.3066667 0.6933333
## 561 0.3066667 0.6933333
## 563 0.8260870 0.1739130
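More concretely, each test set observation is assigned to one subset (leaf) of the tree, and the two columns give the predicted probabilities of outcome 0 and outcome 1 for that subset.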
# ROC analysis of the CART model, scored by the predicted probability of
# outcome 1 (the second column of PredictROC).
pred <- prediction(predictions = PredictROC[, 2], labels = Test$Reverse)
# True-positive rate against false-positive rate, i.e. the ROC curve.
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf)
# Area under the ROC curve, extracted from the performance object.
unlist(slot(performance(pred, measure = "auc"), "y.values"))
## [1] 0.6856584
#install.packages("randomForest")
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.2.3
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
# First attempt: Reverse is still an integer column at this point, so
# randomForest asks whether regression is really intended (warning below).
StevensForest <- randomForest(
  Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst,
  data = Train,
  ntree = 200,
  nodesize = 25
)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
# Recode the outcome as a factor in both the training and the test set.
Train$Reverse <- as.factor(Train$Reverse)
Test$Reverse <- as.factor(Test$Reverse)
In CART, we added the argument method="class", so that it was clear that we’re doing a classification problem. (Trees can also be used for regression problems.) On the other hand, the Random Forest function does not have a method argument. So, when we want to do a classification problem, we need to make sure the outcome is a factor.
# Refit the forest with Reverse stored as a factor, using 200 trees and
# nodesize = 25.
StevensForest <- randomForest(
  Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst,
  data = Train,
  ntree = 200,
  nodesize = 25
)
PredictForest <- predict(StevensForest, newdata = Test)
# Confusion matrix: rows are actual outcomes, columns are predictions.
table(Test$Reverse, PredictForest)
## PredictForest
## 0 1
## 0 43 34
## 1 20 73
# Test-set accuracy, computed from the predictions rather than from counts
# retyped out of the table above, so it cannot drift from the actual results
# when the forest (which has a random component) is refit.
mean(as.character(PredictForest) == as.character(Test$Reverse))
## [1] 0.6823529
In general, in terms of accuracy, Logistic regression < CART (0.6) < Random Forest (0.6823529). Note that the random component of the random forest method can change the accuracy from run to run. For a more stable dataset the accuracy will not change very much, but for a noisy dataset it can be significantly affected by the random samples.
#install.packages("caret")
library(caret)
## Warning: package 'caret' was built under R version 3.2.3
## Loading required package: lattice
## Loading required package: ggplot2
#install.packages("e1071")
library(e1071)
## Warning: package 'e1071' was built under R version 3.2.3
# Resampling scheme: cross-validation with k = 10 folds.
numFolds <- trainControl(method = "cv", number = 10)
# Candidate cp values to try: 0.01, 0.02, ..., 0.50.
cpGrid <- expand.grid(.cp = seq(from = 0.01, to = 0.5, by = 0.01))
We need to pick the possible values for the cp parameter, using the expand.grid function.
# Cross-validate a CART model (method = "rpart") over every cp value in
# cpGrid, using the resampling scheme in numFolds; the summary is printed.
train(
  Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst,
  data = Train,
  method = "rpart",
  trControl = numFolds,
  tuneGrid = cpGrid
)
## CART
##
## 396 samples
## 8 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 356, 357, 357, 356, 356, 357, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa Accuracy SD Kappa SD
## 0.01 0.6538462 0.29054260 0.062801614 0.12950848
## 0.02 0.6436538 0.27521047 0.055747081 0.11699416
## 0.03 0.6461538 0.28489865 0.048407694 0.10441368
## 0.04 0.6537179 0.30447768 0.043908879 0.09374528
## 0.05 0.6614103 0.32119925 0.047986905 0.10186976
## 0.06 0.6614103 0.32119925 0.047986905 0.10186976
## 0.07 0.6614103 0.32119925 0.047986905 0.10186976
## 0.08 0.6614103 0.32119925 0.047986905 0.10186976
## 0.09 0.6614103 0.32119925 0.047986905 0.10186976
## 0.10 0.6614103 0.32119925 0.047986905 0.10186976
## 0.11 0.6614103 0.32119925 0.047986905 0.10186976
## 0.12 0.6614103 0.32119925 0.047986905 0.10186976
## 0.13 0.6614103 0.32119925 0.047986905 0.10186976
## 0.14 0.6614103 0.32119925 0.047986905 0.10186976
## 0.15 0.6614103 0.32119925 0.047986905 0.10186976
## 0.16 0.6614103 0.32119925 0.047986905 0.10186976
## 0.17 0.6614103 0.32119925 0.047986905 0.10186976
## 0.18 0.6614103 0.32119925 0.047986905 0.10186976
## 0.19 0.6614103 0.32119925 0.047986905 0.10186976
## 0.20 0.6614103 0.32119925 0.047986905 0.10186976
## 0.21 0.6614103 0.32119925 0.047986905 0.10186976
## 0.22 0.6614103 0.32119925 0.047986905 0.10186976
## 0.23 0.6614103 0.32119925 0.047986905 0.10186976
## 0.24 0.6614103 0.32119925 0.047986905 0.10186976
## 0.25 0.6084615 0.18673042 0.048700889 0.13886578
## 0.26 0.5657051 0.06434802 0.032109989 0.10541042
## 0.27 0.5505128 0.01680000 0.014818650 0.05312626
## 0.28 0.5453846 0.00000000 0.005958436 0.00000000
## 0.29 0.5453846 0.00000000 0.005958436 0.00000000
## 0.30 0.5453846 0.00000000 0.005958436 0.00000000
## 0.31 0.5453846 0.00000000 0.005958436 0.00000000
## 0.32 0.5453846 0.00000000 0.005958436 0.00000000
## 0.33 0.5453846 0.00000000 0.005958436 0.00000000
## 0.34 0.5453846 0.00000000 0.005958436 0.00000000
## 0.35 0.5453846 0.00000000 0.005958436 0.00000000
## 0.36 0.5453846 0.00000000 0.005958436 0.00000000
## 0.37 0.5453846 0.00000000 0.005958436 0.00000000
## 0.38 0.5453846 0.00000000 0.005958436 0.00000000
## 0.39 0.5453846 0.00000000 0.005958436 0.00000000
## 0.40 0.5453846 0.00000000 0.005958436 0.00000000
## 0.41 0.5453846 0.00000000 0.005958436 0.00000000
## 0.42 0.5453846 0.00000000 0.005958436 0.00000000
## 0.43 0.5453846 0.00000000 0.005958436 0.00000000
## 0.44 0.5453846 0.00000000 0.005958436 0.00000000
## 0.45 0.5453846 0.00000000 0.005958436 0.00000000
## 0.46 0.5453846 0.00000000 0.005958436 0.00000000
## 0.47 0.5453846 0.00000000 0.005958436 0.00000000
## 0.48 0.5453846 0.00000000 0.005958436 0.00000000
## 0.49 0.5453846 0.00000000 0.005958436 0.00000000
## 0.50 0.5453846 0.00000000 0.005958436 0.00000000
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.24.
# Refit the classification tree with cp = 0.24, the value the cross-validation
# summary above reports as the final one.
StevensTreeCV <- rpart(
  Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst,
  data = Train,
  method = "class",
  cp = 0.24
)
PredictCV <- predict(StevensTreeCV, newdata = Test, type = "class")
# Confusion matrix: rows are actual outcomes, columns are predictions.
table(Test$Reverse, PredictCV)
## PredictCV
## 0 1
## 0 51 26
## 1 28 65
# Test-set accuracy, computed from the predictions rather than from counts
# retyped out of the table above, so it stays in sync with the fitted model.
mean(as.character(PredictCV) == as.character(Test$Reverse))
## [1] 0.6823529