Load data

# Read the Supreme Court case data. stringsAsFactors is set explicitly because
# the default changed to FALSE in R 4.0.0; the models below expect the text
# columns (Circuit, Issue, Petitioner, Respondent, LowerCourt) to be factors,
# as shown in the str() output that follows.
stevens <- read.csv("stevens.csv", stringsAsFactors = TRUE)
str(stevens)
## 'data.frame':    566 obs. of  9 variables:
##  $ Docket    : Factor w/ 566 levels "00-1011","00-1045",..: 63 69 70 145 97 181 242 289 334 436 ...
##  $ Term      : int  1994 1994 1994 1994 1995 1995 1996 1997 1997 1999 ...
##  $ Circuit   : Factor w/ 13 levels "10th","11th",..: 4 11 7 3 9 11 13 11 12 2 ...
##  $ Issue     : Factor w/ 11 levels "Attorneys","CivilRights",..: 5 5 5 5 9 5 5 5 5 3 ...
##  $ Petitioner: Factor w/ 12 levels "AMERICAN.INDIAN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Respondent: Factor w/ 12 levels "AMERICAN.INDIAN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ LowerCourt: Factor w/ 2 levels "conser","liberal": 2 2 2 1 1 1 1 1 1 1 ...
##  $ Unconst   : int  0 0 0 0 0 1 0 1 0 0 ...
##  $ Reverse   : int  1 1 1 1 1 0 1 1 1 1 ...

Split data into training and test sets

library(caTools)
## Warning: package 'caTools' was built under R version 3.1.3
# Fix the RNG so the split is repeatable, then draw a logical mask over the
# rows with a 0.7 split ratio on the outcome column.
set.seed(3000)
spl <- sample.split(stevens$Reverse, SplitRatio = 0.7)
# Rows flagged TRUE form the training set; the remaining rows form the test set.
train <- stevens[spl, ]
test <- stevens[!spl, ]

Build CART model

library(rpart)
## Warning: package 'rpart' was built under R version 3.1.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.1.3
# Classification tree for Reverse on the six case descriptors, fitted on the
# training rows with minbucket = 25.
stevensTree <- rpart(
  Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst,
  data = train,
  method = "class",
  minbucket = 25
)
# Draw the fitted tree.
prp(stevensTree)

Predict using test data

# Predicted class labels for the held-out rows.
predictCART <- predict(stevensTree, newdata = test, type = "class")
# Confusion matrix: rows are the actual outcome, columns the predicted outcome.
confmat <- table(test$Reverse, predictCART)
N <- sum(confmat)
# Accuracy = correctly classified cases (the diagonal) over all test cases.
(accuracy <- sum(diag(confmat)) / N)
## [1] 0.6588235
# Baseline: always predict the most frequent actual outcome. The rows of
# confmat are the actual Reverse values, so take the largest row total rather
# than assuming that the second row (Reverse == 1) is the majority class.
baseline.accuracy <- max(rowSums(confmat)) / N

Generate ROC curve

library(ROCR)
## Warning: package 'ROCR' was built under R version 3.1.3
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.1.3
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess
# Per-class probabilities for the test rows (one column per outcome level),
# requested explicitly instead of relying on predict()'s default type.
predictROC <- predict(stevensTree, newdata = test, type = "prob")
head(predictROC)
##            0         1
## 1  0.3035714 0.6964286
## 3  0.3035714 0.6964286
## 4  0.4000000 0.6000000
## 6  0.4000000 0.6000000
## 8  0.4000000 0.6000000
## 21 0.3035714 0.6964286
# Score each test case by its probability for outcome "1" (the second column
# of predictROC) and pair the scores with the actual labels.
pred <- prediction(predictROC[, "1"], test$Reverse)
# ROC curve: true positive rate on the y axis against false positive rate.
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf)

Compute AUC

# y.values is a list holding a single number for the "auc" measure; pull it out.
performance(pred, measure = "auc")@y.values[[1]]
## [1] 0.6927105

Change minbucket to see how the number of splits changes:

# Same model as before, refitted with a small minbucket (5) and plotted.
small <- rpart(
  Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst,
  data = train,
  method = "class",
  minbucket = 5
)
prp(small)

# Same model again with a large minbucket (100), plotted for comparison.
big <- rpart(
  Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst,
  data = train,
  method = "class",
  minbucket = 100
)
prp(big)

Build random forest. The output variable needs to be a factor so that randomForest performs classification rather than regression. Choose nodesize=25 (the counterpart of minbucket in CART)

library(randomForest)
## Warning: package 'randomForest' was built under R version 3.1.3
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
# Recode the outcome as a factor in both data sets before fitting the forest.
train$Reverse <- factor(train$Reverse)
test$Reverse <- factor(test$Reverse)
# Random forest on the same six predictors, with nodesize 25 and 200 trees.
stevensForest <- randomForest(
  Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst,
  data = train,
  nodesize = 25,
  ntree = 200
)

Make prediction using test data

# Forest predictions for the held-out rows.
predictForest <- predict(stevensForest, newdata = test)
# Confusion matrix: actual outcome in rows, predicted outcome in columns.
confmat <- table(test$Reverse, predictForest)
N <- sum(confmat)
# Share of test cases on the diagonal, i.e. classified correctly.
(accuracy <- sum(diag(confmat)) / N)
## [1] 0.6882353

Set the seed to 100 and then to 200, and get the accuracy for each

# Fit a random forest under a fixed RNG seed and return its accuracy on a
# held-out data set.
#
# Args:
#   seed:      value passed to set.seed() before fitting. Note that this resets
#              the session-wide RNG state as a side effect.
#   trainData: data frame used to fit the forest; defaults to the global
#              `train`, so existing calls such as computeAccuracy(100) behave
#              as before.
#   testData:  data frame to score; defaults to the global `test`.
#   nodesize:  forwarded to randomForest() (default 25).
#   ntree:     forwarded to randomForest() (default 200).
#
# Returns:
#   The fraction of rows in testData whose predicted Reverse value equals the
#   observed one.
computeAccuracy <- function(seed, trainData = train, testData = test,
                            nodesize = 25, ntree = 200) {
  set.seed(seed)
  forest <- randomForest(
    Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst,
    data = trainData,
    nodesize = nodesize,
    ntree = ntree
  )
  predicted <- predict(forest, newdata = testData)
  # Rows are the actual outcome, columns the predicted one; the diagonal holds
  # the correctly classified cases.
  confmat <- table(testData$Reverse, predicted)
  sum(diag(confmat)) / sum(confmat)
}

# Test-set accuracy of the forest when the seed is 100 ...
computeAccuracy(100)
## [1] 0.6882353
# ... and when the seed is 200.
computeAccuracy(200)
## [1] 0.7058824

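How to determine minbucket? Instead of guessing it, use k-fold cross-validation to choose the complexity parameter cp, which controls the size of the tree in a similar way.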

library(caret)
## Warning: package 'caret' was built under R version 3.1.3
## Loading required package: lattice
## Loading required package: ggplot2
library(e1071)
## Warning: package 'e1071' was built under R version 3.1.3
# Resampling scheme: cross-validation with `kfolds` folds.
kfolds <- 10
numFolds <- trainControl(method = "cv", number = kfolds)
# Candidate cp values: 0.01, 0.02, ..., 0.50.
cpGrid <- expand.grid(.cp = seq(from = 0.01, to = 0.5, by = 0.01))
# Cross-validate an rpart model at every candidate cp and print the summary.
(cpRes <- train(
  Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst,
  data = train,
  method = "rpart",
  trControl = numFolds,
  tuneGrid = cpGrid
))
## CART 
## 
## 396 samples
##   8 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## 
## Summary of sample sizes: 357, 356, 357, 356, 356, 356, ... 
## 
## Resampling results across tuning parameters:
## 
##   cp    Accuracy   Kappa        Accuracy SD  Kappa SD  
##   0.01  0.6365385  0.252522710  0.045831216  0.10138314
##   0.02  0.6337179  0.248281522  0.061267954  0.12752003
##   0.03  0.6314103  0.251796733  0.053552823  0.11468412
##   0.04  0.6314103  0.253786180  0.053552823  0.11438075
##   0.05  0.6440385  0.282995035  0.062472910  0.13104160
##   0.06  0.6440385  0.282995035  0.062472910  0.13104160
##   0.07  0.6440385  0.282995035  0.062472910  0.13104160
##   0.08  0.6440385  0.282995035  0.062472910  0.13104160
##   0.09  0.6440385  0.282995035  0.062472910  0.13104160
##   0.10  0.6440385  0.282995035  0.062472910  0.13104160
##   0.11  0.6440385  0.282995035  0.062472910  0.13104160
##   0.12  0.6440385  0.282995035  0.062472910  0.13104160
##   0.13  0.6440385  0.282995035  0.062472910  0.13104160
##   0.14  0.6440385  0.282995035  0.062472910  0.13104160
##   0.15  0.6440385  0.282995035  0.062472910  0.13104160
##   0.16  0.6440385  0.282995035  0.062472910  0.13104160
##   0.17  0.6440385  0.282995035  0.062472910  0.13104160
##   0.18  0.6440385  0.282995035  0.062472910  0.13104160
##   0.19  0.6440385  0.282995035  0.062472910  0.13104160
##   0.20  0.6085897  0.193703966  0.058244587  0.14192310
##   0.21  0.5807692  0.121202966  0.046444754  0.12714614
##   0.22  0.5605128  0.062732119  0.032700267  0.09526381
##   0.23  0.5428846  0.003553299  0.008506582  0.01123652
##   0.24  0.5428846  0.003553299  0.008506582  0.01123652
##   0.25  0.5453846  0.000000000  0.005958436  0.00000000
##   0.26  0.5453846  0.000000000  0.005958436  0.00000000
##   0.27  0.5453846  0.000000000  0.005958436  0.00000000
##   0.28  0.5453846  0.000000000  0.005958436  0.00000000
##   0.29  0.5453846  0.000000000  0.005958436  0.00000000
##   0.30  0.5453846  0.000000000  0.005958436  0.00000000
##   0.31  0.5453846  0.000000000  0.005958436  0.00000000
##   0.32  0.5453846  0.000000000  0.005958436  0.00000000
##   0.33  0.5453846  0.000000000  0.005958436  0.00000000
##   0.34  0.5453846  0.000000000  0.005958436  0.00000000
##   0.35  0.5453846  0.000000000  0.005958436  0.00000000
##   0.36  0.5453846  0.000000000  0.005958436  0.00000000
##   0.37  0.5453846  0.000000000  0.005958436  0.00000000
##   0.38  0.5453846  0.000000000  0.005958436  0.00000000
##   0.39  0.5453846  0.000000000  0.005958436  0.00000000
##   0.40  0.5453846  0.000000000  0.005958436  0.00000000
##   0.41  0.5453846  0.000000000  0.005958436  0.00000000
##   0.42  0.5453846  0.000000000  0.005958436  0.00000000
##   0.43  0.5453846  0.000000000  0.005958436  0.00000000
##   0.44  0.5453846  0.000000000  0.005958436  0.00000000
##   0.45  0.5453846  0.000000000  0.005958436  0.00000000
##   0.46  0.5453846  0.000000000  0.005958436  0.00000000
##   0.47  0.5453846  0.000000000  0.005958436  0.00000000
##   0.48  0.5453846  0.000000000  0.005958436  0.00000000
##   0.49  0.5453846  0.000000000  0.005958436  0.00000000
##   0.50  0.5453846  0.000000000  0.005958436  0.00000000
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was cp = 0.19.
# Line plot of cross-validated accuracy against each candidate cp value.
with(
  cpRes$results,
  plot(cp, Accuracy, type = "l", xlab = "cp", ylab = "accuracy")
)

Now create a new model using the cp value selected by cross-validation

# Refit the classification tree using the cp value selected by
# cross-validation. cpRes$bestTune is a one-row data frame, so extract the
# numeric cp column instead of handing the whole data frame to rpart.
stevensTreeCV <- rpart(
  Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst,
  data = train,
  method = "class",
  cp = cpRes$bestTune$cp
)
# Predicted class labels for the held-out rows.
predictCV <- predict(stevensTreeCV, newdata = test, type = "class")
# Confusion matrix: actual outcome in rows, predicted outcome in columns.
confmat <- table(test$Reverse, predictCV)
# Test-set accuracy of the cross-validated tree.
(confmat[1, 1] + confmat[2, 2]) / sum(confmat)
## [1] 0.7235294

Plot the tree

# Draw the tree that was fitted with the cross-validated cp value.
prp(stevensTreeCV)